diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97784c924dab46af4f40c291ef293b07b2997684 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/__init__.py @@ -0,0 +1,57 @@ +""" +pandas._config is considered explicitly upstream of everything else in pandas, +should have no intra-pandas dependencies. + +importing `dates` and `display` ensures that keys needed by _libs +are initialized. +""" +__all__ = [ + "config", + "detect_console_encoding", + "get_option", + "set_option", + "reset_option", + "describe_option", + "option_context", + "options", + "using_copy_on_write", + "warn_copy_on_write", +] +from pandas._config import config +from pandas._config import dates # pyright: ignore[reportUnusedImport] # noqa: F401 +from pandas._config.config import ( + _global_config, + describe_option, + get_option, + option_context, + options, + reset_option, + set_option, +) +from pandas._config.display import detect_console_encoding + + +def using_copy_on_write() -> bool: + _mode_options = _global_config["mode"] + return ( + _mode_options["copy_on_write"] is True + and _mode_options["data_manager"] == "block" + ) + + +def warn_copy_on_write() -> bool: + _mode_options = _global_config["mode"] + return ( + _mode_options["copy_on_write"] == "warn" + and _mode_options["data_manager"] == "block" + ) + + +def using_nullable_dtypes() -> bool: + _mode_options = _global_config["mode"] + return _mode_options["nullable_dtypes"] + + +def using_pyarrow_string_dtype() -> bool: + _mode_options = _global_config["future"] + return _mode_options["infer_string"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/config.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/config.py new file mode 100644 index 0000000000000000000000000000000000000000..c391939d22491099652e13ad81e83b201f140b60 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/config.py @@ -0,0 +1,948 @@ +""" +The config module holds package-wide configurables and provides +a uniform API for working with them. + +Overview +======== + +This module supports the following requirements: +- options are referenced using keys in dot.notation, e.g. "x.y.option - z". +- keys are case-insensitive. +- functions should accept partial/regex keys, when unambiguous. +- options can be registered by modules at import time. +- options can be registered at init-time (via core.config_init) +- options have a default value, and (optionally) a description and + validation function associated with them. +- options can be deprecated, in which case referencing them + should produce a warning. +- deprecated options can optionally be rerouted to a replacement + so that accessing a deprecated option reroutes to a differently + named option. +- options can be reset to their default value. +- all option can be reset to their default value at once. +- all options in a certain sub - namespace can be reset at once. +- the user can set / get / reset or ask for the description of an option. +- a developer can register and mark an option as deprecated. +- you can register a callback to be invoked when the option value + is set or reset. Changing the stored value is considered misuse, but + is not verboten. + +Implementation +============== + +- Data is stored using nested dictionaries, and should be accessed + through the provided API. + +- "Registered options" and "Deprecated options" have metadata associated + with them, which are stored in auxiliary dictionaries keyed on the + fully-qualified key, e.g. "x.y.z.option". + +- the config_init module is imported by the package's __init__.py file. + placing any register_option() calls there will ensure those options + are available as soon as pandas is loaded. If you use register_option + in a module, it will only be available after that module is imported, + which you should be aware of. + +- `config_prefix` is a context_manager (for use with the `with` keyword) + which can save developers some typing, see the docstring. + +""" + +from __future__ import annotations + +from contextlib import ( + ContextDecorator, + contextmanager, +) +import re +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Generic, + NamedTuple, + cast, +) +import warnings + +from pandas._typing import ( + F, + T, +) +from pandas.util._exceptions import find_stack_level + +if TYPE_CHECKING: + from collections.abc import ( + Generator, + Iterable, + ) + + +class DeprecatedOption(NamedTuple): + key: str + msg: str | None + rkey: str | None + removal_ver: str | None + + +class RegisteredOption(NamedTuple): + key: str + defval: object + doc: str + validator: Callable[[object], Any] | None + cb: Callable[[str], Any] | None + + +# holds deprecated option metadata +_deprecated_options: dict[str, DeprecatedOption] = {} + +# holds registered option metadata +_registered_options: dict[str, RegisteredOption] = {} + +# holds the current values for registered options +_global_config: dict[str, Any] = {} + +# keys which have a special meaning +_reserved_keys: list[str] = ["all"] + + +class OptionError(AttributeError, KeyError): + """ + Exception raised for pandas.options. + + Backwards compatible with KeyError checks. + + Examples + -------- + >>> pd.options.context + Traceback (most recent call last): + OptionError: No such option + """ + + +# +# User API + + +def _get_single_key(pat: str, silent: bool) -> str: + keys = _select_options(pat) + if len(keys) == 0: + if not silent: + _warn_if_deprecated(pat) + raise OptionError(f"No such keys(s): {repr(pat)}") + if len(keys) > 1: + raise OptionError("Pattern matched multiple keys") + key = keys[0] + + if not silent: + _warn_if_deprecated(key) + + key = _translate_key(key) + + return key + + +def _get_option(pat: str, silent: bool = False) -> Any: + key = _get_single_key(pat, silent) + + # walk the nested dict + root, k = _get_root(key) + return root[k] + + +def _set_option(*args, **kwargs) -> None: + # must at least 1 arg deal with constraints later + nargs = len(args) + if not nargs or nargs % 2 != 0: + raise ValueError("Must provide an even number of non-keyword arguments") + + # default to false + silent = kwargs.pop("silent", False) + + if kwargs: + kwarg = next(iter(kwargs.keys())) + raise TypeError(f'_set_option() got an unexpected keyword argument "{kwarg}"') + + for k, v in zip(args[::2], args[1::2]): + key = _get_single_key(k, silent) + + o = _get_registered_option(key) + if o and o.validator: + o.validator(v) + + # walk the nested dict + root, k_root = _get_root(key) + root[k_root] = v + + if o.cb: + if silent: + with warnings.catch_warnings(record=True): + o.cb(key) + else: + o.cb(key) + + +def _describe_option(pat: str = "", _print_desc: bool = True) -> str | None: + keys = _select_options(pat) + if len(keys) == 0: + raise OptionError("No such keys(s)") + + s = "\n".join([_build_option_description(k) for k in keys]) + + if _print_desc: + print(s) + return None + return s + + +def _reset_option(pat: str, silent: bool = False) -> None: + keys = _select_options(pat) + + if len(keys) == 0: + raise OptionError("No such keys(s)") + + if len(keys) > 1 and len(pat) < 4 and pat != "all": + raise ValueError( + "You must specify at least 4 characters when " + "resetting multiple keys, use the special keyword " + '"all" to reset all the options to their default value' + ) + + for k in keys: + _set_option(k, _registered_options[k].defval, silent=silent) + + +def get_default_val(pat: str): + key = _get_single_key(pat, silent=True) + return _get_registered_option(key).defval + + +class DictWrapper: + """provide attribute-style access to a nested dict""" + + d: dict[str, Any] + + def __init__(self, d: dict[str, Any], prefix: str = "") -> None: + object.__setattr__(self, "d", d) + object.__setattr__(self, "prefix", prefix) + + def __setattr__(self, key: str, val: Any) -> None: + prefix = object.__getattribute__(self, "prefix") + if prefix: + prefix += "." + prefix += key + # you can't set new keys + # can you can't overwrite subtrees + if key in self.d and not isinstance(self.d[key], dict): + _set_option(prefix, val) + else: + raise OptionError("You can only set the value of existing options") + + def __getattr__(self, key: str): + prefix = object.__getattribute__(self, "prefix") + if prefix: + prefix += "." + prefix += key + try: + v = object.__getattribute__(self, "d")[key] + except KeyError as err: + raise OptionError("No such option") from err + if isinstance(v, dict): + return DictWrapper(v, prefix) + else: + return _get_option(prefix) + + def __dir__(self) -> list[str]: + return list(self.d.keys()) + + +# For user convenience, we'd like to have the available options described +# in the docstring. For dev convenience we'd like to generate the docstrings +# dynamically instead of maintaining them by hand. To this, we use the +# class below which wraps functions inside a callable, and converts +# __doc__ into a property function. The doctsrings below are templates +# using the py2.6+ advanced formatting syntax to plug in a concise list +# of options, and option descriptions. + + +class CallableDynamicDoc(Generic[T]): + def __init__(self, func: Callable[..., T], doc_tmpl: str) -> None: + self.__doc_tmpl__ = doc_tmpl + self.__func__ = func + + def __call__(self, *args, **kwds) -> T: + return self.__func__(*args, **kwds) + + # error: Signature of "__doc__" incompatible with supertype "object" + @property + def __doc__(self) -> str: # type: ignore[override] + opts_desc = _describe_option("all", _print_desc=False) + opts_list = pp_options_list(list(_registered_options.keys())) + return self.__doc_tmpl__.format(opts_desc=opts_desc, opts_list=opts_list) + + +_get_option_tmpl = """ +get_option(pat) + +Retrieves the value of the specified option. + +Available options: + +{opts_list} + +Parameters +---------- +pat : str + Regexp which should match a single option. + Note: partial matches are supported for convenience, but unless you use the + full option name (e.g. x.y.z.option_name), your code may break in future + versions if new options with similar names are introduced. + +Returns +------- +result : the value of the option + +Raises +------ +OptionError : if no such option exists + +Notes +----- +Please reference the :ref:`User Guide ` for more information. + +The available options with its descriptions: + +{opts_desc} + +Examples +-------- +>>> pd.get_option('display.max_columns') # doctest: +SKIP +4 +""" + +_set_option_tmpl = """ +set_option(pat, value) + +Sets the value of the specified option. + +Available options: + +{opts_list} + +Parameters +---------- +pat : str + Regexp which should match a single option. + Note: partial matches are supported for convenience, but unless you use the + full option name (e.g. x.y.z.option_name), your code may break in future + versions if new options with similar names are introduced. +value : object + New value of option. + +Returns +------- +None + +Raises +------ +OptionError if no such option exists + +Notes +----- +Please reference the :ref:`User Guide ` for more information. + +The available options with its descriptions: + +{opts_desc} + +Examples +-------- +>>> pd.set_option('display.max_columns', 4) +>>> df = pd.DataFrame([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]) +>>> df + 0 1 ... 3 4 +0 1 2 ... 4 5 +1 6 7 ... 9 10 +[2 rows x 5 columns] +>>> pd.reset_option('display.max_columns') +""" + +_describe_option_tmpl = """ +describe_option(pat, _print_desc=False) + +Prints the description for one or more registered options. + +Call with no arguments to get a listing for all registered options. + +Available options: + +{opts_list} + +Parameters +---------- +pat : str + Regexp pattern. All matching keys will have their description displayed. +_print_desc : bool, default True + If True (default) the description(s) will be printed to stdout. + Otherwise, the description(s) will be returned as a unicode string + (for testing). + +Returns +------- +None by default, the description(s) as a unicode string if _print_desc +is False + +Notes +----- +Please reference the :ref:`User Guide ` for more information. + +The available options with its descriptions: + +{opts_desc} + +Examples +-------- +>>> pd.describe_option('display.max_columns') # doctest: +SKIP +display.max_columns : int + If max_cols is exceeded, switch to truncate view... +""" + +_reset_option_tmpl = """ +reset_option(pat) + +Reset one or more options to their default value. + +Pass "all" as argument to reset all options. + +Available options: + +{opts_list} + +Parameters +---------- +pat : str/regex + If specified only options matching `prefix*` will be reset. + Note: partial matches are supported for convenience, but unless you + use the full option name (e.g. x.y.z.option_name), your code may break + in future versions if new options with similar names are introduced. + +Returns +------- +None + +Notes +----- +Please reference the :ref:`User Guide ` for more information. + +The available options with its descriptions: + +{opts_desc} + +Examples +-------- +>>> pd.reset_option('display.max_columns') # doctest: +SKIP +""" + +# bind the functions with their docstrings into a Callable +# and use that as the functions exposed in pd.api +get_option = CallableDynamicDoc(_get_option, _get_option_tmpl) +set_option = CallableDynamicDoc(_set_option, _set_option_tmpl) +reset_option = CallableDynamicDoc(_reset_option, _reset_option_tmpl) +describe_option = CallableDynamicDoc(_describe_option, _describe_option_tmpl) +options = DictWrapper(_global_config) + +# +# Functions for use by pandas developers, in addition to User - api + + +class option_context(ContextDecorator): + """ + Context manager to temporarily set options in the `with` statement context. + + You need to invoke as ``option_context(pat, val, [(pat, val), ...])``. + + Examples + -------- + >>> from pandas import option_context + >>> with option_context('display.max_rows', 10, 'display.max_columns', 5): + ... pass + """ + + def __init__(self, *args) -> None: + if len(args) % 2 != 0 or len(args) < 2: + raise ValueError( + "Need to invoke as option_context(pat, val, [(pat, val), ...])." + ) + + self.ops = list(zip(args[::2], args[1::2])) + + def __enter__(self) -> None: + self.undo = [(pat, _get_option(pat)) for pat, val in self.ops] + + for pat, val in self.ops: + _set_option(pat, val, silent=True) + + def __exit__(self, *args) -> None: + if self.undo: + for pat, val in self.undo: + _set_option(pat, val, silent=True) + + +def register_option( + key: str, + defval: object, + doc: str = "", + validator: Callable[[object], Any] | None = None, + cb: Callable[[str], Any] | None = None, +) -> None: + """ + Register an option in the package-wide pandas config object + + Parameters + ---------- + key : str + Fully-qualified key, e.g. "x.y.option - z". + defval : object + Default value of the option. + doc : str + Description of the option. + validator : Callable, optional + Function of a single argument, should raise `ValueError` if + called with a value which is not a legal value for the option. + cb + a function of a single argument "key", which is called + immediately after an option value is set/reset. key is + the full name of the option. + + Raises + ------ + ValueError if `validator` is specified and `defval` is not a valid value. + + """ + import keyword + import tokenize + + key = key.lower() + + if key in _registered_options: + raise OptionError(f"Option '{key}' has already been registered") + if key in _reserved_keys: + raise OptionError(f"Option '{key}' is a reserved key") + + # the default value should be legal + if validator: + validator(defval) + + # walk the nested dict, creating dicts as needed along the path + path = key.split(".") + + for k in path: + if not re.match("^" + tokenize.Name + "$", k): + raise ValueError(f"{k} is not a valid identifier") + if keyword.iskeyword(k): + raise ValueError(f"{k} is a python keyword") + + cursor = _global_config + msg = "Path prefix to option '{option}' is already an option" + + for i, p in enumerate(path[:-1]): + if not isinstance(cursor, dict): + raise OptionError(msg.format(option=".".join(path[:i]))) + if p not in cursor: + cursor[p] = {} + cursor = cursor[p] + + if not isinstance(cursor, dict): + raise OptionError(msg.format(option=".".join(path[:-1]))) + + cursor[path[-1]] = defval # initialize + + # save the option metadata + _registered_options[key] = RegisteredOption( + key=key, defval=defval, doc=doc, validator=validator, cb=cb + ) + + +def deprecate_option( + key: str, + msg: str | None = None, + rkey: str | None = None, + removal_ver: str | None = None, +) -> None: + """ + Mark option `key` as deprecated, if code attempts to access this option, + a warning will be produced, using `msg` if given, or a default message + if not. + if `rkey` is given, any access to the key will be re-routed to `rkey`. + + Neither the existence of `key` nor that if `rkey` is checked. If they + do not exist, any subsequence access will fail as usual, after the + deprecation warning is given. + + Parameters + ---------- + key : str + Name of the option to be deprecated. + must be a fully-qualified option name (e.g "x.y.z.rkey"). + msg : str, optional + Warning message to output when the key is referenced. + if no message is given a default message will be emitted. + rkey : str, optional + Name of an option to reroute access to. + If specified, any referenced `key` will be + re-routed to `rkey` including set/get/reset. + rkey must be a fully-qualified option name (e.g "x.y.z.rkey"). + used by the default message if no `msg` is specified. + removal_ver : str, optional + Specifies the version in which this option will + be removed. used by the default message if no `msg` is specified. + + Raises + ------ + OptionError + If the specified key has already been deprecated. + """ + key = key.lower() + + if key in _deprecated_options: + raise OptionError(f"Option '{key}' has already been defined as deprecated.") + + _deprecated_options[key] = DeprecatedOption(key, msg, rkey, removal_ver) + + +# +# functions internal to the module + + +def _select_options(pat: str) -> list[str]: + """ + returns a list of keys matching `pat` + + if pat=="all", returns all registered options + """ + # short-circuit for exact key + if pat in _registered_options: + return [pat] + + # else look through all of them + keys = sorted(_registered_options.keys()) + if pat == "all": # reserved key + return keys + + return [k for k in keys if re.search(pat, k, re.I)] + + +def _get_root(key: str) -> tuple[dict[str, Any], str]: + path = key.split(".") + cursor = _global_config + for p in path[:-1]: + cursor = cursor[p] + return cursor, path[-1] + + +def _is_deprecated(key: str) -> bool: + """Returns True if the given option has been deprecated""" + key = key.lower() + return key in _deprecated_options + + +def _get_deprecated_option(key: str): + """ + Retrieves the metadata for a deprecated option, if `key` is deprecated. + + Returns + ------- + DeprecatedOption (namedtuple) if key is deprecated, None otherwise + """ + try: + d = _deprecated_options[key] + except KeyError: + return None + else: + return d + + +def _get_registered_option(key: str): + """ + Retrieves the option metadata if `key` is a registered option. + + Returns + ------- + RegisteredOption (namedtuple) if key is deprecated, None otherwise + """ + return _registered_options.get(key) + + +def _translate_key(key: str) -> str: + """ + if key id deprecated and a replacement key defined, will return the + replacement key, otherwise returns `key` as - is + """ + d = _get_deprecated_option(key) + if d: + return d.rkey or key + else: + return key + + +def _warn_if_deprecated(key: str) -> bool: + """ + Checks if `key` is a deprecated option and if so, prints a warning. + + Returns + ------- + bool - True if `key` is deprecated, False otherwise. + """ + d = _get_deprecated_option(key) + if d: + if d.msg: + warnings.warn( + d.msg, + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + msg = f"'{key}' is deprecated" + if d.removal_ver: + msg += f" and will be removed in {d.removal_ver}" + if d.rkey: + msg += f", please use '{d.rkey}' instead." + else: + msg += ", please refrain from using it." + + warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) + return True + return False + + +def _build_option_description(k: str) -> str: + """Builds a formatted description of a registered option and prints it""" + o = _get_registered_option(k) + d = _get_deprecated_option(k) + + s = f"{k} " + + if o.doc: + s += "\n".join(o.doc.strip().split("\n")) + else: + s += "No description available." + + if o: + s += f"\n [default: {o.defval}] [currently: {_get_option(k, True)}]" + + if d: + rkey = d.rkey or "" + s += "\n (Deprecated" + s += f", use `{rkey}` instead." + s += ")" + + return s + + +def pp_options_list(keys: Iterable[str], width: int = 80, _print: bool = False): + """Builds a concise listing of available options, grouped by prefix""" + from itertools import groupby + from textwrap import wrap + + def pp(name: str, ks: Iterable[str]) -> list[str]: + pfx = "- " + name + ".[" if name else "" + ls = wrap( + ", ".join(ks), + width, + initial_indent=pfx, + subsequent_indent=" ", + break_long_words=False, + ) + if ls and ls[-1] and name: + ls[-1] = ls[-1] + "]" + return ls + + ls: list[str] = [] + singles = [x for x in sorted(keys) if x.find(".") < 0] + if singles: + ls += pp("", singles) + keys = [x for x in keys if x.find(".") >= 0] + + for k, g in groupby(sorted(keys), lambda x: x[: x.rfind(".")]): + ks = [x[len(k) + 1 :] for x in list(g)] + ls += pp(k, ks) + s = "\n".join(ls) + if _print: + print(s) + else: + return s + + +# +# helpers + + +@contextmanager +def config_prefix(prefix: str) -> Generator[None, None, None]: + """ + contextmanager for multiple invocations of API with a common prefix + + supported API functions: (register / get / set )__option + + Warning: This is not thread - safe, and won't work properly if you import + the API functions into your module using the "from x import y" construct. + + Example + ------- + import pandas._config.config as cf + with cf.config_prefix("display.font"): + cf.register_option("color", "red") + cf.register_option("size", " 5 pt") + cf.set_option(size, " 6 pt") + cf.get_option(size) + ... + + etc' + + will register options "display.font.color", "display.font.size", set the + value of "display.font.size"... and so on. + """ + # Note: reset_option relies on set_option, and on key directly + # it does not fit in to this monkey-patching scheme + + global register_option, get_option, set_option + + def wrap(func: F) -> F: + def inner(key: str, *args, **kwds): + pkey = f"{prefix}.{key}" + return func(pkey, *args, **kwds) + + return cast(F, inner) + + _register_option = register_option + _get_option = get_option + _set_option = set_option + set_option = wrap(set_option) + get_option = wrap(get_option) + register_option = wrap(register_option) + try: + yield + finally: + set_option = _set_option + get_option = _get_option + register_option = _register_option + + +# These factories and methods are handy for use as the validator +# arg in register_option + + +def is_type_factory(_type: type[Any]) -> Callable[[Any], None]: + """ + + Parameters + ---------- + `_type` - a type to be compared against (e.g. type(x) == `_type`) + + Returns + ------- + validator - a function of a single argument x , which raises + ValueError if type(x) is not equal to `_type` + + """ + + def inner(x) -> None: + if type(x) != _type: + raise ValueError(f"Value must have type '{_type}'") + + return inner + + +def is_instance_factory(_type) -> Callable[[Any], None]: + """ + + Parameters + ---------- + `_type` - the type to be checked against + + Returns + ------- + validator - a function of a single argument x , which raises + ValueError if x is not an instance of `_type` + + """ + if isinstance(_type, (tuple, list)): + _type = tuple(_type) + type_repr = "|".join(map(str, _type)) + else: + type_repr = f"'{_type}'" + + def inner(x) -> None: + if not isinstance(x, _type): + raise ValueError(f"Value must be an instance of {type_repr}") + + return inner + + +def is_one_of_factory(legal_values) -> Callable[[Any], None]: + callables = [c for c in legal_values if callable(c)] + legal_values = [c for c in legal_values if not callable(c)] + + def inner(x) -> None: + if x not in legal_values: + if not any(c(x) for c in callables): + uvals = [str(lval) for lval in legal_values] + pp_values = "|".join(uvals) + msg = f"Value must be one of {pp_values}" + if len(callables): + msg += " or a callable" + raise ValueError(msg) + + return inner + + +def is_nonnegative_int(value: object) -> None: + """ + Verify that value is None or a positive int. + + Parameters + ---------- + value : None or int + The `value` to be checked. + + Raises + ------ + ValueError + When the value is not None or is a negative integer + """ + if value is None: + return + + elif isinstance(value, int): + if value >= 0: + return + + msg = "Value must be a nonnegative integer or None" + raise ValueError(msg) + + +# common type validators, for convenience +# usage: register_option(... , validator = is_int) +is_int = is_type_factory(int) +is_bool = is_type_factory(bool) +is_float = is_type_factory(float) +is_str = is_type_factory(str) +is_text = is_instance_factory((str, bytes)) + + +def is_callable(obj) -> bool: + """ + + Parameters + ---------- + `obj` - the object to be checked + + Returns + ------- + validator - returns True if object is callable + raises ValueError otherwise. + + """ + if not callable(obj): + raise ValueError("Value must be a callable") + return True diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/dates.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/dates.py new file mode 100644 index 0000000000000000000000000000000000000000..b37831f96eb73bf2f128929a1769db6c141eebad --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/dates.py @@ -0,0 +1,25 @@ +""" +config for datetime formatting +""" +from __future__ import annotations + +from pandas._config import config as cf + +pc_date_dayfirst_doc = """ +: boolean + When True, prints and parses dates with the day first, eg 20/01/2005 +""" + +pc_date_yearfirst_doc = """ +: boolean + When True, prints and parses dates with the year first, eg 2005/01/20 +""" + +with cf.config_prefix("display"): + # Needed upstream of `_libs` because these are used in tslibs.parsing + cf.register_option( + "date_dayfirst", False, pc_date_dayfirst_doc, validator=cf.is_bool + ) + cf.register_option( + "date_yearfirst", False, pc_date_yearfirst_doc, validator=cf.is_bool + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/display.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/display.py new file mode 100644 index 0000000000000000000000000000000000000000..df2c3ad36c855d77c33d80c78c3d83ab3c09d5f9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/display.py @@ -0,0 +1,62 @@ +""" +Unopinionated display configuration. +""" + +from __future__ import annotations + +import locale +import sys + +from pandas._config import config as cf + +# ----------------------------------------------------------------------------- +# Global formatting options +_initial_defencoding: str | None = None + + +def detect_console_encoding() -> str: + """ + Try to find the most capable encoding supported by the console. + slightly modified from the way IPython handles the same issue. + """ + global _initial_defencoding + + encoding = None + try: + encoding = sys.stdout.encoding or sys.stdin.encoding + except (AttributeError, OSError): + pass + + # try again for something better + if not encoding or "ascii" in encoding.lower(): + try: + encoding = locale.getpreferredencoding() + except locale.Error: + # can be raised by locale.setlocale(), which is + # called by getpreferredencoding + # (on some systems, see stdlib locale docs) + pass + + # when all else fails. this will usually be "ascii" + if not encoding or "ascii" in encoding.lower(): + encoding = sys.getdefaultencoding() + + # GH#3360, save the reported defencoding at import time + # MPL backends may change it. Make available for debugging. + if not _initial_defencoding: + _initial_defencoding = sys.getdefaultencoding() + + return encoding + + +pc_encoding_doc = """ +: str/unicode + Defaults to the detected encoding of the console. + Specifies the encoding to be used for strings returned by to_string, + these are generally strings meant to be displayed on the console. +""" + +with cf.config_prefix("display"): + cf.register_option( + "encoding", detect_console_encoding(), pc_encoding_doc, validator=cf.is_text + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/localization.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/localization.py new file mode 100644 index 0000000000000000000000000000000000000000..5c1a0ff1395334a55baa6c5d77a71635872fe824 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/localization.py @@ -0,0 +1,172 @@ +""" +Helpers for configuring locale settings. + +Name `localization` is chosen to avoid overlap with builtin `locale` module. +""" +from __future__ import annotations + +from contextlib import contextmanager +import locale +import platform +import re +import subprocess +from typing import TYPE_CHECKING + +from pandas._config.config import options + +if TYPE_CHECKING: + from collections.abc import Generator + + +@contextmanager +def set_locale( + new_locale: str | tuple[str, str], lc_var: int = locale.LC_ALL +) -> Generator[str | tuple[str, str], None, None]: + """ + Context manager for temporarily setting a locale. + + Parameters + ---------- + new_locale : str or tuple + A string of the form .. For example to set + the current locale to US English with a UTF8 encoding, you would pass + "en_US.UTF-8". + lc_var : int, default `locale.LC_ALL` + The category of the locale being set. + + Notes + ----- + This is useful when you want to run a particular block of code under a + particular locale, without globally setting the locale. This probably isn't + thread-safe. + """ + # getlocale is not always compliant with setlocale, use setlocale. GH#46595 + current_locale = locale.setlocale(lc_var) + + try: + locale.setlocale(lc_var, new_locale) + normalized_code, normalized_encoding = locale.getlocale() + if normalized_code is not None and normalized_encoding is not None: + yield f"{normalized_code}.{normalized_encoding}" + else: + yield new_locale + finally: + locale.setlocale(lc_var, current_locale) + + +def can_set_locale(lc: str, lc_var: int = locale.LC_ALL) -> bool: + """ + Check to see if we can set a locale, and subsequently get the locale, + without raising an Exception. + + Parameters + ---------- + lc : str + The locale to attempt to set. + lc_var : int, default `locale.LC_ALL` + The category of the locale being set. + + Returns + ------- + bool + Whether the passed locale can be set + """ + try: + with set_locale(lc, lc_var=lc_var): + pass + except (ValueError, locale.Error): + # horrible name for a Exception subclass + return False + else: + return True + + +def _valid_locales(locales: list[str] | str, normalize: bool) -> list[str]: + """ + Return a list of normalized locales that do not throw an ``Exception`` + when set. + + Parameters + ---------- + locales : str + A string where each locale is separated by a newline. + normalize : bool + Whether to call ``locale.normalize`` on each locale. + + Returns + ------- + valid_locales : list + A list of valid locales. + """ + return [ + loc + for loc in ( + locale.normalize(loc.strip()) if normalize else loc.strip() + for loc in locales + ) + if can_set_locale(loc) + ] + + +def get_locales( + prefix: str | None = None, + normalize: bool = True, +) -> list[str]: + """ + Get all the locales that are available on the system. + + Parameters + ---------- + prefix : str + If not ``None`` then return only those locales with the prefix + provided. For example to get all English language locales (those that + start with ``"en"``), pass ``prefix="en"``. + normalize : bool + Call ``locale.normalize`` on the resulting list of available locales. + If ``True``, only locales that can be set without throwing an + ``Exception`` are returned. + + Returns + ------- + locales : list of strings + A list of locale strings that can be set with ``locale.setlocale()``. + For example:: + + locale.setlocale(locale.LC_ALL, locale_string) + + On error will return an empty list (no locale available, e.g. Windows) + + """ + if platform.system() in ("Linux", "Darwin"): + raw_locales = subprocess.check_output(["locale", "-a"]) + else: + # Other platforms e.g. windows platforms don't define "locale -a" + # Note: is_platform_windows causes circular import here + return [] + + try: + # raw_locales is "\n" separated list of locales + # it may contain non-decodable parts, so split + # extract what we can and then rejoin. + split_raw_locales = raw_locales.split(b"\n") + out_locales = [] + for x in split_raw_locales: + try: + out_locales.append(str(x, encoding=options.display.encoding)) + except UnicodeError: + # 'locale -a' is used to populated 'raw_locales' and on + # Redhat 7 Linux (and maybe others) prints locale names + # using windows-1252 encoding. Bug only triggered by + # a few special characters and when there is an + # extensive list of installed locales. + out_locales.append(str(x, encoding="windows-1252")) + + except TypeError: + pass + + if prefix is None: + return _valid_locales(out_locales, normalize) + + pattern = re.compile(f"{prefix}.*") + found = pattern.findall("\n".join(out_locales)) + return _valid_locales(found, normalize) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..26a872a90e4934c929d168228125f2e43936c774 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/__init__.py @@ -0,0 +1,27 @@ +__all__ = [ + "NaT", + "NaTType", + "OutOfBoundsDatetime", + "Period", + "Timedelta", + "Timestamp", + "iNaT", + "Interval", +] + + +# Below imports needs to happen first to ensure pandas top level +# module gets monkeypatched with the pandas_datetime_CAPI +# see pandas_datetime_exec in pd_datetime.c +import pandas._libs.pandas_parser # isort: skip # type: ignore[reportUnusedImport] +import pandas._libs.pandas_datetime # noqa: F401 # isort: skip # type: ignore[reportUnusedImport] +from pandas._libs.interval import Interval +from pandas._libs.tslibs import ( + NaT, + NaTType, + OutOfBoundsDatetime, + Period, + Timedelta, + Timestamp, + iNaT, +) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/algos.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/algos.pyi new file mode 100644 index 0000000000000000000000000000000000000000..caf5425dfc7b44cbb2fea103e56d4584709eca1e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/algos.pyi @@ -0,0 +1,416 @@ +from typing import Any + +import numpy as np + +from pandas._typing import npt + +class Infinity: + def __eq__(self, other) -> bool: ... + def __ne__(self, other) -> bool: ... + def __lt__(self, other) -> bool: ... + def __le__(self, other) -> bool: ... + def __gt__(self, other) -> bool: ... + def __ge__(self, other) -> bool: ... + +class NegInfinity: + def __eq__(self, other) -> bool: ... + def __ne__(self, other) -> bool: ... + def __lt__(self, other) -> bool: ... + def __le__(self, other) -> bool: ... + def __gt__(self, other) -> bool: ... + def __ge__(self, other) -> bool: ... + +def unique_deltas( + arr: np.ndarray, # const int64_t[:] +) -> np.ndarray: ... # np.ndarray[np.int64, ndim=1] +def is_lexsorted(list_of_arrays: list[npt.NDArray[np.int64]]) -> bool: ... +def groupsort_indexer( + index: np.ndarray, # const int64_t[:] + ngroups: int, +) -> tuple[ + np.ndarray, # ndarray[int64_t, ndim=1] + np.ndarray, # ndarray[int64_t, ndim=1] +]: ... +def kth_smallest( + arr: np.ndarray, # numeric[:] + k: int, +) -> Any: ... # numeric + +# ---------------------------------------------------------------------- +# Pairwise correlation/covariance + +def nancorr( + mat: npt.NDArray[np.float64], # const float64_t[:, :] + cov: bool = ..., + minp: int | None = ..., +) -> npt.NDArray[np.float64]: ... # ndarray[float64_t, ndim=2] +def nancorr_spearman( + mat: npt.NDArray[np.float64], # ndarray[float64_t, ndim=2] + minp: int = ..., +) -> npt.NDArray[np.float64]: ... # ndarray[float64_t, ndim=2] + +# ---------------------------------------------------------------------- + +def validate_limit(nobs: int | None, limit=...) -> int: ... +def get_fill_indexer( + mask: npt.NDArray[np.bool_], + limit: int | None = None, +) -> npt.NDArray[np.intp]: ... +def pad( + old: np.ndarray, # ndarray[numeric_object_t] + new: np.ndarray, # ndarray[numeric_object_t] + limit=..., +) -> npt.NDArray[np.intp]: ... # np.ndarray[np.intp, ndim=1] +def pad_inplace( + values: np.ndarray, # numeric_object_t[:] + mask: np.ndarray, # uint8_t[:] + limit=..., +) -> None: ... +def pad_2d_inplace( + values: np.ndarray, # numeric_object_t[:, :] + mask: np.ndarray, # const uint8_t[:, :] + limit=..., +) -> None: ... +def backfill( + old: np.ndarray, # ndarray[numeric_object_t] + new: np.ndarray, # ndarray[numeric_object_t] + limit=..., +) -> npt.NDArray[np.intp]: ... # np.ndarray[np.intp, ndim=1] +def backfill_inplace( + values: np.ndarray, # numeric_object_t[:] + mask: np.ndarray, # uint8_t[:] + limit=..., +) -> None: ... +def backfill_2d_inplace( + values: np.ndarray, # numeric_object_t[:, :] + mask: np.ndarray, # const uint8_t[:, :] + limit=..., +) -> None: ... +def is_monotonic( + arr: np.ndarray, # ndarray[numeric_object_t, ndim=1] + timelike: bool, +) -> tuple[bool, bool, bool]: ... + +# ---------------------------------------------------------------------- +# rank_1d, rank_2d +# ---------------------------------------------------------------------- + +def rank_1d( + values: np.ndarray, # ndarray[numeric_object_t, ndim=1] + labels: np.ndarray | None = ..., # const int64_t[:]=None + is_datetimelike: bool = ..., + ties_method=..., + ascending: bool = ..., + pct: bool = ..., + na_option=..., + mask: npt.NDArray[np.bool_] | None = ..., +) -> np.ndarray: ... # np.ndarray[float64_t, ndim=1] +def rank_2d( + in_arr: np.ndarray, # ndarray[numeric_object_t, ndim=2] + axis: int = ..., + is_datetimelike: bool = ..., + ties_method=..., + ascending: bool = ..., + na_option=..., + pct: bool = ..., +) -> np.ndarray: ... # np.ndarray[float64_t, ndim=1] +def diff_2d( + arr: np.ndarray, # ndarray[diff_t, ndim=2] + out: np.ndarray, # ndarray[out_t, ndim=2] + periods: int, + axis: int, + datetimelike: bool = ..., +) -> None: ... +def ensure_platform_int(arr: object) -> npt.NDArray[np.intp]: ... +def ensure_object(arr: object) -> npt.NDArray[np.object_]: ... +def ensure_float64(arr: object) -> npt.NDArray[np.float64]: ... +def ensure_int8(arr: object) -> npt.NDArray[np.int8]: ... +def ensure_int16(arr: object) -> npt.NDArray[np.int16]: ... +def ensure_int32(arr: object) -> npt.NDArray[np.int32]: ... +def ensure_int64(arr: object) -> npt.NDArray[np.int64]: ... +def ensure_uint64(arr: object) -> npt.NDArray[np.uint64]: ... +def take_1d_int8_int8( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int8_int32( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int8_int64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int8_float64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int16_int16( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int16_int32( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int16_int64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int16_float64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int32_int32( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int32_int64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int32_float64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int64_int64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_int64_float64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_float32_float32( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_float32_float64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_float64_float64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_object_object( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_bool_bool( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_bool_object( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int8_int8( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int8_int32( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int8_int64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int8_float64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int16_int16( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int16_int32( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int16_int64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int16_float64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int32_int32( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int32_int64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int32_float64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int64_int64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_int64_float64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_float32_float32( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_float32_float64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_float64_float64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_object_object( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_bool_bool( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_bool_object( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int8_int8( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int8_int32( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int8_int64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int8_float64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int16_int16( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int16_int32( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int16_int64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int16_float64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int32_int32( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int32_int64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int32_float64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int64_int64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_int64_float64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_float32_float32( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_float32_float64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_float64_float64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_object_object( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_bool_bool( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_bool_object( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_multi_int8_int8( + values: np.ndarray, + indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]], + out: np.ndarray, + fill_value=..., +) -> None: ... +def take_2d_multi_int8_int32( + values: np.ndarray, + indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]], + out: np.ndarray, + fill_value=..., +) -> None: ... +def take_2d_multi_int8_int64( + values: np.ndarray, + indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]], + out: np.ndarray, + fill_value=..., +) -> None: ... +def take_2d_multi_int8_float64( + values: np.ndarray, + indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]], + out: np.ndarray, + fill_value=..., +) -> None: ... +def take_2d_multi_int16_int16( + values: np.ndarray, + indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]], + out: np.ndarray, + fill_value=..., +) -> None: ... +def take_2d_multi_int16_int32( + values: np.ndarray, + indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]], + out: np.ndarray, + fill_value=..., +) -> None: ... +def take_2d_multi_int16_int64( + values: np.ndarray, + indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]], + out: np.ndarray, + fill_value=..., +) -> None: ... +def take_2d_multi_int16_float64( + values: np.ndarray, + indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]], + out: np.ndarray, + fill_value=..., +) -> None: ... +def take_2d_multi_int32_int32( + values: np.ndarray, + indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]], + out: np.ndarray, + fill_value=..., +) -> None: ... +def take_2d_multi_int32_int64( + values: np.ndarray, + indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]], + out: np.ndarray, + fill_value=..., +) -> None: ... +def take_2d_multi_int32_float64( + values: np.ndarray, + indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]], + out: np.ndarray, + fill_value=..., +) -> None: ... +def take_2d_multi_int64_float64( + values: np.ndarray, + indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]], + out: np.ndarray, + fill_value=..., +) -> None: ... +def take_2d_multi_float32_float32( + values: np.ndarray, + indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]], + out: np.ndarray, + fill_value=..., +) -> None: ... +def take_2d_multi_float32_float64( + values: np.ndarray, + indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]], + out: np.ndarray, + fill_value=..., +) -> None: ... +def take_2d_multi_float64_float64( + values: np.ndarray, + indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]], + out: np.ndarray, + fill_value=..., +) -> None: ... +def take_2d_multi_object_object( + values: np.ndarray, + indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]], + out: np.ndarray, + fill_value=..., +) -> None: ... +def take_2d_multi_bool_bool( + values: np.ndarray, + indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]], + out: np.ndarray, + fill_value=..., +) -> None: ... +def take_2d_multi_bool_object( + values: np.ndarray, + indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]], + out: np.ndarray, + fill_value=..., +) -> None: ... +def take_2d_multi_int64_int64( + values: np.ndarray, + indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]], + out: np.ndarray, + fill_value=..., +) -> None: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/arrays.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/arrays.pyi new file mode 100644 index 0000000000000000000000000000000000000000..86f69c3cdfc75f3101a8e0afe59f9b102d90ad79 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/arrays.pyi @@ -0,0 +1,40 @@ +from typing import Sequence + +import numpy as np + +from pandas._typing import ( + AxisInt, + DtypeObj, + Self, + Shape, +) + +class NDArrayBacked: + _dtype: DtypeObj + _ndarray: np.ndarray + def __init__(self, values: np.ndarray, dtype: DtypeObj) -> None: ... + @classmethod + def _simple_new(cls, values: np.ndarray, dtype: DtypeObj): ... + def _from_backing_data(self, values: np.ndarray): ... + def __setstate__(self, state): ... + def __len__(self) -> int: ... + @property + def shape(self) -> Shape: ... + @property + def ndim(self) -> int: ... + @property + def size(self) -> int: ... + @property + def nbytes(self) -> int: ... + def copy(self, order=...): ... + def delete(self, loc, axis=...): ... + def swapaxes(self, axis1, axis2): ... + def repeat(self, repeats: int | Sequence[int], axis: int | None = ...): ... + def reshape(self, *args, **kwargs): ... + def ravel(self, order=...): ... + @property + def T(self): ... + @classmethod + def _concat_same_type( + cls, to_concat: Sequence[Self], axis: AxisInt = ... + ) -> Self: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/byteswap.cpython-312-x86_64-linux-gnu.so b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/byteswap.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..2de09e64a0e494775e7df90f26b3beadab07c2d2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/byteswap.cpython-312-x86_64-linux-gnu.so differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/byteswap.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/byteswap.pyi new file mode 100644 index 0000000000000000000000000000000000000000..bb0dbfc6a50b1bb7cd509dc5b3dfeed55ad70b09 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/byteswap.pyi @@ -0,0 +1,5 @@ +def read_float_with_byteswap(data: bytes, offset: int, byteswap: bool) -> float: ... +def read_double_with_byteswap(data: bytes, offset: int, byteswap: bool) -> float: ... +def read_uint16_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ... +def read_uint32_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ... +def read_uint64_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/groupby.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/groupby.pyi new file mode 100644 index 0000000000000000000000000000000000000000..a494b61fa7e3d70c77a921eb8bfcc269c76db9fb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/groupby.pyi @@ -0,0 +1,216 @@ +from typing import Literal + +import numpy as np + +from pandas._typing import npt + +def group_median_float64( + out: np.ndarray, # ndarray[float64_t, ndim=2] + counts: npt.NDArray[np.int64], + values: np.ndarray, # ndarray[float64_t, ndim=2] + labels: npt.NDArray[np.int64], + min_count: int = ..., # Py_ssize_t + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., +) -> None: ... +def group_cumprod( + out: np.ndarray, # float64_t[:, ::1] + values: np.ndarray, # const float64_t[:, :] + labels: np.ndarray, # const int64_t[:] + ngroups: int, + is_datetimelike: bool, + skipna: bool = ..., + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., +) -> None: ... +def group_cumsum( + out: np.ndarray, # int64float_t[:, ::1] + values: np.ndarray, # ndarray[int64float_t, ndim=2] + labels: np.ndarray, # const int64_t[:] + ngroups: int, + is_datetimelike: bool, + skipna: bool = ..., + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., +) -> None: ... +def group_shift_indexer( + out: np.ndarray, # int64_t[::1] + labels: np.ndarray, # const int64_t[:] + ngroups: int, + periods: int, +) -> None: ... +def group_fillna_indexer( + out: np.ndarray, # ndarray[intp_t] + labels: np.ndarray, # ndarray[int64_t] + sorted_labels: npt.NDArray[np.intp], + mask: npt.NDArray[np.uint8], + limit: int, # int64_t + dropna: bool, +) -> None: ... +def group_any_all( + out: np.ndarray, # uint8_t[::1] + values: np.ndarray, # const uint8_t[::1] + labels: np.ndarray, # const int64_t[:] + mask: np.ndarray, # const uint8_t[::1] + val_test: Literal["any", "all"], + skipna: bool, + result_mask: np.ndarray | None, +) -> None: ... +def group_sum( + out: np.ndarray, # complexfloatingintuint_t[:, ::1] + counts: np.ndarray, # int64_t[::1] + values: np.ndarray, # ndarray[complexfloatingintuint_t, ndim=2] + labels: np.ndarray, # const intp_t[:] + mask: np.ndarray | None, + result_mask: np.ndarray | None = ..., + min_count: int = ..., + is_datetimelike: bool = ..., +) -> None: ... +def group_prod( + out: np.ndarray, # int64float_t[:, ::1] + counts: np.ndarray, # int64_t[::1] + values: np.ndarray, # ndarray[int64float_t, ndim=2] + labels: np.ndarray, # const intp_t[:] + mask: np.ndarray | None, + result_mask: np.ndarray | None = ..., + min_count: int = ..., +) -> None: ... +def group_var( + out: np.ndarray, # floating[:, ::1] + counts: np.ndarray, # int64_t[::1] + values: np.ndarray, # ndarray[floating, ndim=2] + labels: np.ndarray, # const intp_t[:] + min_count: int = ..., # Py_ssize_t + ddof: int = ..., # int64_t + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., + is_datetimelike: bool = ..., + name: str = ..., +) -> None: ... +def group_skew( + out: np.ndarray, # float64_t[:, ::1] + counts: np.ndarray, # int64_t[::1] + values: np.ndarray, # ndarray[float64_T, ndim=2] + labels: np.ndarray, # const intp_t[::1] + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., + skipna: bool = ..., +) -> None: ... +def group_mean( + out: np.ndarray, # floating[:, ::1] + counts: np.ndarray, # int64_t[::1] + values: np.ndarray, # ndarray[floating, ndim=2] + labels: np.ndarray, # const intp_t[:] + min_count: int = ..., # Py_ssize_t + is_datetimelike: bool = ..., # bint + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., +) -> None: ... +def group_ohlc( + out: np.ndarray, # floatingintuint_t[:, ::1] + counts: np.ndarray, # int64_t[::1] + values: np.ndarray, # ndarray[floatingintuint_t, ndim=2] + labels: np.ndarray, # const intp_t[:] + min_count: int = ..., + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., +) -> None: ... +def group_quantile( + out: npt.NDArray[np.float64], + values: np.ndarray, # ndarray[numeric, ndim=1] + labels: npt.NDArray[np.intp], + mask: npt.NDArray[np.uint8], + qs: npt.NDArray[np.float64], # const + starts: npt.NDArray[np.int64], + ends: npt.NDArray[np.int64], + interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"], + result_mask: np.ndarray | None, + is_datetimelike: bool, +) -> None: ... +def group_last( + out: np.ndarray, # rank_t[:, ::1] + counts: np.ndarray, # int64_t[::1] + values: np.ndarray, # ndarray[rank_t, ndim=2] + labels: np.ndarray, # const int64_t[:] + mask: npt.NDArray[np.bool_] | None, + result_mask: npt.NDArray[np.bool_] | None = ..., + min_count: int = ..., # Py_ssize_t + is_datetimelike: bool = ..., + skipna: bool = ..., +) -> None: ... +def group_nth( + out: np.ndarray, # rank_t[:, ::1] + counts: np.ndarray, # int64_t[::1] + values: np.ndarray, # ndarray[rank_t, ndim=2] + labels: np.ndarray, # const int64_t[:] + mask: npt.NDArray[np.bool_] | None, + result_mask: npt.NDArray[np.bool_] | None = ..., + min_count: int = ..., # int64_t + rank: int = ..., # int64_t + is_datetimelike: bool = ..., + skipna: bool = ..., +) -> None: ... +def group_rank( + out: np.ndarray, # float64_t[:, ::1] + values: np.ndarray, # ndarray[rank_t, ndim=2] + labels: np.ndarray, # const int64_t[:] + ngroups: int, + is_datetimelike: bool, + ties_method: Literal["average", "min", "max", "first", "dense"] = ..., + ascending: bool = ..., + pct: bool = ..., + na_option: Literal["keep", "top", "bottom"] = ..., + mask: npt.NDArray[np.bool_] | None = ..., +) -> None: ... +def group_max( + out: np.ndarray, # groupby_t[:, ::1] + counts: np.ndarray, # int64_t[::1] + values: np.ndarray, # ndarray[groupby_t, ndim=2] + labels: np.ndarray, # const int64_t[:] + min_count: int = ..., + is_datetimelike: bool = ..., + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., +) -> None: ... +def group_min( + out: np.ndarray, # groupby_t[:, ::1] + counts: np.ndarray, # int64_t[::1] + values: np.ndarray, # ndarray[groupby_t, ndim=2] + labels: np.ndarray, # const int64_t[:] + min_count: int = ..., + is_datetimelike: bool = ..., + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., +) -> None: ... +def group_idxmin_idxmax( + out: npt.NDArray[np.intp], + counts: npt.NDArray[np.int64], + values: np.ndarray, # ndarray[groupby_t, ndim=2] + labels: npt.NDArray[np.intp], + min_count: int = ..., + is_datetimelike: bool = ..., + mask: np.ndarray | None = ..., + name: str = ..., + skipna: bool = ..., + result_mask: np.ndarray | None = ..., +) -> None: ... +def group_cummin( + out: np.ndarray, # groupby_t[:, ::1] + values: np.ndarray, # ndarray[groupby_t, ndim=2] + labels: np.ndarray, # const int64_t[:] + ngroups: int, + is_datetimelike: bool, + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., + skipna: bool = ..., +) -> None: ... +def group_cummax( + out: np.ndarray, # groupby_t[:, ::1] + values: np.ndarray, # ndarray[groupby_t, ndim=2] + labels: np.ndarray, # const int64_t[:] + ngroups: int, + is_datetimelike: bool, + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., + skipna: bool = ..., +) -> None: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/hashing.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/hashing.pyi new file mode 100644 index 0000000000000000000000000000000000000000..8361026e4a87d462e04c53f7f1f8aee8a7f6ffe0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/hashing.pyi @@ -0,0 +1,9 @@ +import numpy as np + +from pandas._typing import npt + +def hash_object_array( + arr: npt.NDArray[np.object_], + key: str, + encoding: str = ..., +) -> npt.NDArray[np.uint64]: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/hashtable.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/hashtable.pyi new file mode 100644 index 0000000000000000000000000000000000000000..555ec73acd9b2e9ccbb5ea145db77862e2c54dbf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/hashtable.pyi @@ -0,0 +1,252 @@ +from typing import ( + Any, + Hashable, + Literal, +) + +import numpy as np + +from pandas._typing import npt + +def unique_label_indices( + labels: np.ndarray, # const int64_t[:] +) -> np.ndarray: ... + +class Factorizer: + count: int + uniques: Any + def __init__(self, size_hint: int) -> None: ... + def get_count(self) -> int: ... + def factorize( + self, + values: np.ndarray, + na_sentinel=..., + na_value=..., + mask=..., + ) -> npt.NDArray[np.intp]: ... + +class ObjectFactorizer(Factorizer): + table: PyObjectHashTable + uniques: ObjectVector + +class Int64Factorizer(Factorizer): + table: Int64HashTable + uniques: Int64Vector + +class UInt64Factorizer(Factorizer): + table: UInt64HashTable + uniques: UInt64Vector + +class Int32Factorizer(Factorizer): + table: Int32HashTable + uniques: Int32Vector + +class UInt32Factorizer(Factorizer): + table: UInt32HashTable + uniques: UInt32Vector + +class Int16Factorizer(Factorizer): + table: Int16HashTable + uniques: Int16Vector + +class UInt16Factorizer(Factorizer): + table: UInt16HashTable + uniques: UInt16Vector + +class Int8Factorizer(Factorizer): + table: Int8HashTable + uniques: Int8Vector + +class UInt8Factorizer(Factorizer): + table: UInt8HashTable + uniques: UInt8Vector + +class Float64Factorizer(Factorizer): + table: Float64HashTable + uniques: Float64Vector + +class Float32Factorizer(Factorizer): + table: Float32HashTable + uniques: Float32Vector + +class Complex64Factorizer(Factorizer): + table: Complex64HashTable + uniques: Complex64Vector + +class Complex128Factorizer(Factorizer): + table: Complex128HashTable + uniques: Complex128Vector + +class Int64Vector: + def __init__(self, *args) -> None: ... + def __len__(self) -> int: ... + def to_array(self) -> npt.NDArray[np.int64]: ... + +class Int32Vector: + def __init__(self, *args) -> None: ... + def __len__(self) -> int: ... + def to_array(self) -> npt.NDArray[np.int32]: ... + +class Int16Vector: + def __init__(self, *args) -> None: ... + def __len__(self) -> int: ... + def to_array(self) -> npt.NDArray[np.int16]: ... + +class Int8Vector: + def __init__(self, *args) -> None: ... + def __len__(self) -> int: ... + def to_array(self) -> npt.NDArray[np.int8]: ... + +class UInt64Vector: + def __init__(self, *args) -> None: ... + def __len__(self) -> int: ... + def to_array(self) -> npt.NDArray[np.uint64]: ... + +class UInt32Vector: + def __init__(self, *args) -> None: ... + def __len__(self) -> int: ... + def to_array(self) -> npt.NDArray[np.uint32]: ... + +class UInt16Vector: + def __init__(self, *args) -> None: ... + def __len__(self) -> int: ... + def to_array(self) -> npt.NDArray[np.uint16]: ... + +class UInt8Vector: + def __init__(self, *args) -> None: ... + def __len__(self) -> int: ... + def to_array(self) -> npt.NDArray[np.uint8]: ... + +class Float64Vector: + def __init__(self, *args) -> None: ... + def __len__(self) -> int: ... + def to_array(self) -> npt.NDArray[np.float64]: ... + +class Float32Vector: + def __init__(self, *args) -> None: ... + def __len__(self) -> int: ... + def to_array(self) -> npt.NDArray[np.float32]: ... + +class Complex128Vector: + def __init__(self, *args) -> None: ... + def __len__(self) -> int: ... + def to_array(self) -> npt.NDArray[np.complex128]: ... + +class Complex64Vector: + def __init__(self, *args) -> None: ... + def __len__(self) -> int: ... + def to_array(self) -> npt.NDArray[np.complex64]: ... + +class StringVector: + def __init__(self, *args) -> None: ... + def __len__(self) -> int: ... + def to_array(self) -> npt.NDArray[np.object_]: ... + +class ObjectVector: + def __init__(self, *args) -> None: ... + def __len__(self) -> int: ... + def to_array(self) -> npt.NDArray[np.object_]: ... + +class HashTable: + # NB: The base HashTable class does _not_ actually have these methods; + # we are putting them here for the sake of mypy to avoid + # reproducing them in each subclass below. + def __init__(self, size_hint: int = ..., uses_mask: bool = ...) -> None: ... + def __len__(self) -> int: ... + def __contains__(self, key: Hashable) -> bool: ... + def sizeof(self, deep: bool = ...) -> int: ... + def get_state(self) -> dict[str, int]: ... + # TODO: `val/key` type is subclass-specific + def get_item(self, val): ... # TODO: return type? + def set_item(self, key, val) -> None: ... + def get_na(self): ... # TODO: return type? + def set_na(self, val) -> None: ... + def map_locations( + self, + values: np.ndarray, # np.ndarray[subclass-specific] + mask: npt.NDArray[np.bool_] | None = ..., + ) -> None: ... + def lookup( + self, + values: np.ndarray, # np.ndarray[subclass-specific] + mask: npt.NDArray[np.bool_] | None = ..., + ) -> npt.NDArray[np.intp]: ... + def get_labels( + self, + values: np.ndarray, # np.ndarray[subclass-specific] + uniques, # SubclassTypeVector + count_prior: int = ..., + na_sentinel: int = ..., + na_value: object = ..., + mask=..., + ) -> npt.NDArray[np.intp]: ... + def unique( + self, + values: np.ndarray, # np.ndarray[subclass-specific] + return_inverse: bool = ..., + mask=..., + ) -> ( + tuple[ + np.ndarray, # np.ndarray[subclass-specific] + npt.NDArray[np.intp], + ] + | np.ndarray + ): ... # np.ndarray[subclass-specific] + def factorize( + self, + values: np.ndarray, # np.ndarray[subclass-specific] + na_sentinel: int = ..., + na_value: object = ..., + mask=..., + ignore_na: bool = True, + ) -> tuple[np.ndarray, npt.NDArray[np.intp]]: ... # np.ndarray[subclass-specific] + +class Complex128HashTable(HashTable): ... +class Complex64HashTable(HashTable): ... +class Float64HashTable(HashTable): ... +class Float32HashTable(HashTable): ... + +class Int64HashTable(HashTable): + # Only Int64HashTable has get_labels_groupby, map_keys_to_values + def get_labels_groupby( + self, + values: npt.NDArray[np.int64], # const int64_t[:] + ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64]]: ... + def map_keys_to_values( + self, + keys: npt.NDArray[np.int64], + values: npt.NDArray[np.int64], # const int64_t[:] + ) -> None: ... + +class Int32HashTable(HashTable): ... +class Int16HashTable(HashTable): ... +class Int8HashTable(HashTable): ... +class UInt64HashTable(HashTable): ... +class UInt32HashTable(HashTable): ... +class UInt16HashTable(HashTable): ... +class UInt8HashTable(HashTable): ... +class StringHashTable(HashTable): ... +class PyObjectHashTable(HashTable): ... +class IntpHashTable(HashTable): ... + +def duplicated( + values: np.ndarray, + keep: Literal["last", "first", False] = ..., + mask: npt.NDArray[np.bool_] | None = ..., +) -> npt.NDArray[np.bool_]: ... +def mode( + values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = ... +) -> np.ndarray: ... +def value_count( + values: np.ndarray, + dropna: bool, + mask: npt.NDArray[np.bool_] | None = ..., +) -> tuple[np.ndarray, npt.NDArray[np.int64], int]: ... # np.ndarray[same-as-values] + +# arr and values should have same dtype +def ismember( + arr: np.ndarray, + values: np.ndarray, +) -> npt.NDArray[np.bool_]: ... +def object_hash(obj) -> int: ... +def objects_are_equal(a, b) -> bool: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/index.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/index.pyi new file mode 100644 index 0000000000000000000000000000000000000000..75db47bf3160e828b178c0cd733452c9dc5e8919 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/index.pyi @@ -0,0 +1,100 @@ +import numpy as np + +from pandas._typing import npt + +from pandas import MultiIndex +from pandas.core.arrays import ExtensionArray + +multiindex_nulls_shift: int + +class IndexEngine: + over_size_threshold: bool + def __init__(self, values: np.ndarray) -> None: ... + def __contains__(self, val: object) -> bool: ... + + # -> int | slice | np.ndarray[bool] + def get_loc(self, val: object) -> int | slice | np.ndarray: ... + def sizeof(self, deep: bool = ...) -> int: ... + def __sizeof__(self) -> int: ... + @property + def is_unique(self) -> bool: ... + @property + def is_monotonic_increasing(self) -> bool: ... + @property + def is_monotonic_decreasing(self) -> bool: ... + @property + def is_mapping_populated(self) -> bool: ... + def clear_mapping(self): ... + def get_indexer(self, values: np.ndarray) -> npt.NDArray[np.intp]: ... + def get_indexer_non_unique( + self, + targets: np.ndarray, + ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... + +class MaskedIndexEngine(IndexEngine): + def __init__(self, values: object) -> None: ... + def get_indexer_non_unique( + self, targets: object + ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... + +class Float64Engine(IndexEngine): ... +class Float32Engine(IndexEngine): ... +class Complex128Engine(IndexEngine): ... +class Complex64Engine(IndexEngine): ... +class Int64Engine(IndexEngine): ... +class Int32Engine(IndexEngine): ... +class Int16Engine(IndexEngine): ... +class Int8Engine(IndexEngine): ... +class UInt64Engine(IndexEngine): ... +class UInt32Engine(IndexEngine): ... +class UInt16Engine(IndexEngine): ... +class UInt8Engine(IndexEngine): ... +class ObjectEngine(IndexEngine): ... +class DatetimeEngine(Int64Engine): ... +class TimedeltaEngine(DatetimeEngine): ... +class PeriodEngine(Int64Engine): ... +class BoolEngine(UInt8Engine): ... +class MaskedFloat64Engine(MaskedIndexEngine): ... +class MaskedFloat32Engine(MaskedIndexEngine): ... +class MaskedComplex128Engine(MaskedIndexEngine): ... +class MaskedComplex64Engine(MaskedIndexEngine): ... +class MaskedInt64Engine(MaskedIndexEngine): ... +class MaskedInt32Engine(MaskedIndexEngine): ... +class MaskedInt16Engine(MaskedIndexEngine): ... +class MaskedInt8Engine(MaskedIndexEngine): ... +class MaskedUInt64Engine(MaskedIndexEngine): ... +class MaskedUInt32Engine(MaskedIndexEngine): ... +class MaskedUInt16Engine(MaskedIndexEngine): ... +class MaskedUInt8Engine(MaskedIndexEngine): ... +class MaskedBoolEngine(MaskedUInt8Engine): ... + +class BaseMultiIndexCodesEngine: + levels: list[np.ndarray] + offsets: np.ndarray # ndarray[uint64_t, ndim=1] + + def __init__( + self, + levels: list[np.ndarray], # all entries hashable + labels: list[np.ndarray], # all entries integer-dtyped + offsets: np.ndarray, # np.ndarray[np.uint64, ndim=1] + ) -> None: ... + def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ... + def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ... + +class ExtensionEngine: + def __init__(self, values: ExtensionArray) -> None: ... + def __contains__(self, val: object) -> bool: ... + def get_loc(self, val: object) -> int | slice | np.ndarray: ... + def get_indexer(self, values: np.ndarray) -> npt.NDArray[np.intp]: ... + def get_indexer_non_unique( + self, + targets: np.ndarray, + ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... + @property + def is_unique(self) -> bool: ... + @property + def is_monotonic_increasing(self) -> bool: ... + @property + def is_monotonic_decreasing(self) -> bool: ... + def sizeof(self, deep: bool = ...) -> int: ... + def clear_mapping(self): ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/indexing.cpython-312-x86_64-linux-gnu.so b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/indexing.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..ec244cfdf1b2bcbbff9f3a920f11fbd45edb1efe Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/indexing.cpython-312-x86_64-linux-gnu.so differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/indexing.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/indexing.pyi new file mode 100644 index 0000000000000000000000000000000000000000..3ae5c5044a2f75452fa57ba578af2c7b4c78ec96 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/indexing.pyi @@ -0,0 +1,17 @@ +from typing import ( + Generic, + TypeVar, +) + +from pandas.core.indexing import IndexingMixin + +_IndexingMixinT = TypeVar("_IndexingMixinT", bound=IndexingMixin) + +class NDFrameIndexerBase(Generic[_IndexingMixinT]): + name: str + # in practice obj is either a DataFrame or a Series + obj: _IndexingMixinT + + def __init__(self, name: str, obj: _IndexingMixinT) -> None: ... + @property + def ndim(self) -> int: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/internals.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/internals.pyi new file mode 100644 index 0000000000000000000000000000000000000000..ffe6c7730bcdcfb8af1407bf55b59170c4093699 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/internals.pyi @@ -0,0 +1,94 @@ +from typing import ( + Iterator, + Sequence, + final, + overload, +) +import weakref + +import numpy as np + +from pandas._typing import ( + ArrayLike, + Self, + npt, +) + +from pandas import Index +from pandas.core.internals.blocks import Block as B + +def slice_len(slc: slice, objlen: int = ...) -> int: ... +def get_concat_blkno_indexers( + blknos_list: list[npt.NDArray[np.intp]], +) -> list[tuple[npt.NDArray[np.intp], BlockPlacement]]: ... +def get_blkno_indexers( + blknos: np.ndarray, # int64_t[:] + group: bool = ..., +) -> list[tuple[int, slice | np.ndarray]]: ... +def get_blkno_placements( + blknos: np.ndarray, + group: bool = ..., +) -> Iterator[tuple[int, BlockPlacement]]: ... +def update_blklocs_and_blknos( + blklocs: npt.NDArray[np.intp], + blknos: npt.NDArray[np.intp], + loc: int, + nblocks: int, +) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... +@final +class BlockPlacement: + def __init__(self, val: int | slice | np.ndarray) -> None: ... + @property + def indexer(self) -> np.ndarray | slice: ... + @property + def as_array(self) -> np.ndarray: ... + @property + def as_slice(self) -> slice: ... + @property + def is_slice_like(self) -> bool: ... + @overload + def __getitem__( + self, loc: slice | Sequence[int] | npt.NDArray[np.intp] + ) -> BlockPlacement: ... + @overload + def __getitem__(self, loc: int) -> int: ... + def __iter__(self) -> Iterator[int]: ... + def __len__(self) -> int: ... + def delete(self, loc) -> BlockPlacement: ... + def add(self, other) -> BlockPlacement: ... + def append(self, others: list[BlockPlacement]) -> BlockPlacement: ... + def tile_for_unstack(self, factor: int) -> npt.NDArray[np.intp]: ... + +class Block: + _mgr_locs: BlockPlacement + ndim: int + values: ArrayLike + refs: BlockValuesRefs + def __init__( + self, + values: ArrayLike, + placement: BlockPlacement, + ndim: int, + refs: BlockValuesRefs | None = ..., + ) -> None: ... + def slice_block_rows(self, slicer: slice) -> Self: ... + +class BlockManager: + blocks: tuple[B, ...] + axes: list[Index] + _known_consolidated: bool + _is_consolidated: bool + _blknos: np.ndarray + _blklocs: np.ndarray + def __init__( + self, blocks: tuple[B, ...], axes: list[Index], verify_integrity=... + ) -> None: ... + def get_slice(self, slobj: slice, axis: int = ...) -> Self: ... + def _rebuild_blknos_and_blklocs(self) -> None: ... + +class BlockValuesRefs: + referenced_blocks: list[weakref.ref] + def __init__(self, blk: Block | None = ...) -> None: ... + def add_reference(self, blk: Block) -> None: ... + def add_index_reference(self, index: Index) -> None: ... + def has_reference(self) -> bool: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/interval.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/interval.pyi new file mode 100644 index 0000000000000000000000000000000000000000..587fdf84f2f85520713352bbcab29804c95621e5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/interval.pyi @@ -0,0 +1,174 @@ +from typing import ( + Any, + Generic, + TypeVar, + overload, +) + +import numpy as np +import numpy.typing as npt + +from pandas._typing import ( + IntervalClosedType, + Timedelta, + Timestamp, +) + +VALID_CLOSED: frozenset[str] + +_OrderableScalarT = TypeVar("_OrderableScalarT", int, float) +_OrderableTimesT = TypeVar("_OrderableTimesT", Timestamp, Timedelta) +_OrderableT = TypeVar("_OrderableT", int, float, Timestamp, Timedelta) + +class _LengthDescriptor: + @overload + def __get__( + self, instance: Interval[_OrderableScalarT], owner: Any + ) -> _OrderableScalarT: ... + @overload + def __get__( + self, instance: Interval[_OrderableTimesT], owner: Any + ) -> Timedelta: ... + +class _MidDescriptor: + @overload + def __get__(self, instance: Interval[_OrderableScalarT], owner: Any) -> float: ... + @overload + def __get__( + self, instance: Interval[_OrderableTimesT], owner: Any + ) -> _OrderableTimesT: ... + +class IntervalMixin: + @property + def closed_left(self) -> bool: ... + @property + def closed_right(self) -> bool: ... + @property + def open_left(self) -> bool: ... + @property + def open_right(self) -> bool: ... + @property + def is_empty(self) -> bool: ... + def _check_closed_matches(self, other: IntervalMixin, name: str = ...) -> None: ... + +class Interval(IntervalMixin, Generic[_OrderableT]): + @property + def left(self: Interval[_OrderableT]) -> _OrderableT: ... + @property + def right(self: Interval[_OrderableT]) -> _OrderableT: ... + @property + def closed(self) -> IntervalClosedType: ... + mid: _MidDescriptor + length: _LengthDescriptor + def __init__( + self, + left: _OrderableT, + right: _OrderableT, + closed: IntervalClosedType = ..., + ) -> None: ... + def __hash__(self) -> int: ... + @overload + def __contains__( + self: Interval[Timedelta], key: Timedelta | Interval[Timedelta] + ) -> bool: ... + @overload + def __contains__( + self: Interval[Timestamp], key: Timestamp | Interval[Timestamp] + ) -> bool: ... + @overload + def __contains__( + self: Interval[_OrderableScalarT], + key: _OrderableScalarT | Interval[_OrderableScalarT], + ) -> bool: ... + @overload + def __add__( + self: Interval[_OrderableTimesT], y: Timedelta + ) -> Interval[_OrderableTimesT]: ... + @overload + def __add__( + self: Interval[int], y: _OrderableScalarT + ) -> Interval[_OrderableScalarT]: ... + @overload + def __add__(self: Interval[float], y: float) -> Interval[float]: ... + @overload + def __radd__( + self: Interval[_OrderableTimesT], y: Timedelta + ) -> Interval[_OrderableTimesT]: ... + @overload + def __radd__( + self: Interval[int], y: _OrderableScalarT + ) -> Interval[_OrderableScalarT]: ... + @overload + def __radd__(self: Interval[float], y: float) -> Interval[float]: ... + @overload + def __sub__( + self: Interval[_OrderableTimesT], y: Timedelta + ) -> Interval[_OrderableTimesT]: ... + @overload + def __sub__( + self: Interval[int], y: _OrderableScalarT + ) -> Interval[_OrderableScalarT]: ... + @overload + def __sub__(self: Interval[float], y: float) -> Interval[float]: ... + @overload + def __rsub__( + self: Interval[_OrderableTimesT], y: Timedelta + ) -> Interval[_OrderableTimesT]: ... + @overload + def __rsub__( + self: Interval[int], y: _OrderableScalarT + ) -> Interval[_OrderableScalarT]: ... + @overload + def __rsub__(self: Interval[float], y: float) -> Interval[float]: ... + @overload + def __mul__( + self: Interval[int], y: _OrderableScalarT + ) -> Interval[_OrderableScalarT]: ... + @overload + def __mul__(self: Interval[float], y: float) -> Interval[float]: ... + @overload + def __rmul__( + self: Interval[int], y: _OrderableScalarT + ) -> Interval[_OrderableScalarT]: ... + @overload + def __rmul__(self: Interval[float], y: float) -> Interval[float]: ... + @overload + def __truediv__( + self: Interval[int], y: _OrderableScalarT + ) -> Interval[_OrderableScalarT]: ... + @overload + def __truediv__(self: Interval[float], y: float) -> Interval[float]: ... + @overload + def __floordiv__( + self: Interval[int], y: _OrderableScalarT + ) -> Interval[_OrderableScalarT]: ... + @overload + def __floordiv__(self: Interval[float], y: float) -> Interval[float]: ... + def overlaps(self: Interval[_OrderableT], other: Interval[_OrderableT]) -> bool: ... + +def intervals_to_interval_bounds( + intervals: np.ndarray, validate_closed: bool = ... +) -> tuple[np.ndarray, np.ndarray, IntervalClosedType]: ... + +class IntervalTree(IntervalMixin): + def __init__( + self, + left: np.ndarray, + right: np.ndarray, + closed: IntervalClosedType = ..., + leaf_size: int = ..., + ) -> None: ... + @property + def mid(self) -> np.ndarray: ... + @property + def length(self) -> np.ndarray: ... + def get_indexer(self, target) -> npt.NDArray[np.intp]: ... + def get_indexer_non_unique( + self, target + ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... + _na_count: int + @property + def is_overlapping(self) -> bool: ... + @property + def is_monotonic_increasing(self) -> bool: ... + def clear_mapping(self) -> None: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/join.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/join.pyi new file mode 100644 index 0000000000000000000000000000000000000000..1d4e8c90bc5593eae650319e9ca1b58cbd7eed73 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/join.pyi @@ -0,0 +1,79 @@ +import numpy as np + +from pandas._typing import npt + +def inner_join( + left: np.ndarray, # const intp_t[:] + right: np.ndarray, # const intp_t[:] + max_groups: int, + sort: bool = ..., +) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... +def left_outer_join( + left: np.ndarray, # const intp_t[:] + right: np.ndarray, # const intp_t[:] + max_groups: int, + sort: bool = ..., +) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... +def full_outer_join( + left: np.ndarray, # const intp_t[:] + right: np.ndarray, # const intp_t[:] + max_groups: int, +) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... +def ffill_indexer( + indexer: np.ndarray, # const intp_t[:] +) -> npt.NDArray[np.intp]: ... +def left_join_indexer_unique( + left: np.ndarray, # ndarray[join_t] + right: np.ndarray, # ndarray[join_t] +) -> npt.NDArray[np.intp]: ... +def left_join_indexer( + left: np.ndarray, # ndarray[join_t] + right: np.ndarray, # ndarray[join_t] +) -> tuple[ + np.ndarray, # np.ndarray[join_t] + npt.NDArray[np.intp], + npt.NDArray[np.intp], +]: ... +def inner_join_indexer( + left: np.ndarray, # ndarray[join_t] + right: np.ndarray, # ndarray[join_t] +) -> tuple[ + np.ndarray, # np.ndarray[join_t] + npt.NDArray[np.intp], + npt.NDArray[np.intp], +]: ... +def outer_join_indexer( + left: np.ndarray, # ndarray[join_t] + right: np.ndarray, # ndarray[join_t] +) -> tuple[ + np.ndarray, # np.ndarray[join_t] + npt.NDArray[np.intp], + npt.NDArray[np.intp], +]: ... +def asof_join_backward_on_X_by_Y( + left_values: np.ndarray, # ndarray[numeric_t] + right_values: np.ndarray, # ndarray[numeric_t] + left_by_values: np.ndarray, # const int64_t[:] + right_by_values: np.ndarray, # const int64_t[:] + allow_exact_matches: bool = ..., + tolerance: np.number | float | None = ..., + use_hashtable: bool = ..., +) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... +def asof_join_forward_on_X_by_Y( + left_values: np.ndarray, # ndarray[numeric_t] + right_values: np.ndarray, # ndarray[numeric_t] + left_by_values: np.ndarray, # const int64_t[:] + right_by_values: np.ndarray, # const int64_t[:] + allow_exact_matches: bool = ..., + tolerance: np.number | float | None = ..., + use_hashtable: bool = ..., +) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... +def asof_join_nearest_on_X_by_Y( + left_values: np.ndarray, # ndarray[numeric_t] + right_values: np.ndarray, # ndarray[numeric_t] + left_by_values: np.ndarray, # const int64_t[:] + right_by_values: np.ndarray, # const int64_t[:] + allow_exact_matches: bool = ..., + tolerance: np.number | float | None = ..., + use_hashtable: bool = ..., +) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/json.cpython-312-x86_64-linux-gnu.so b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/json.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..60c6d2ec084814fe1379fd97c3bb76f282382320 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/json.cpython-312-x86_64-linux-gnu.so differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/json.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/json.pyi new file mode 100644 index 0000000000000000000000000000000000000000..bc4fe68573b94290cfba2e66950c1b6d45ccf0dc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/json.pyi @@ -0,0 +1,23 @@ +from typing import ( + Any, + Callable, +) + +def ujson_dumps( + obj: Any, + ensure_ascii: bool = ..., + double_precision: int = ..., + indent: int = ..., + orient: str = ..., + date_unit: str = ..., + iso_dates: bool = ..., + default_handler: None + | Callable[[Any], str | float | bool | list | dict | None] = ..., +) -> str: ... +def ujson_loads( + s: str, + precise_float: bool = ..., + numpy: bool = ..., + dtype: None = ..., + labelled: bool = ..., +) -> Any: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/lib.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/lib.pyi new file mode 100644 index 0000000000000000000000000000000000000000..b9fd970e68f5bd1d635737f65228f4ac7bc43080 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/lib.pyi @@ -0,0 +1,213 @@ +# TODO(npdtypes): Many types specified here can be made more specific/accurate; +# the more specific versions are specified in comments +from decimal import Decimal +from typing import ( + Any, + Callable, + Final, + Generator, + Hashable, + Literal, + TypeAlias, + overload, +) + +import numpy as np + +from pandas._libs.interval import Interval +from pandas._libs.tslibs import Period +from pandas._typing import ( + ArrayLike, + DtypeObj, + TypeGuard, + npt, +) + +# placeholder until we can specify np.ndarray[object, ndim=2] +ndarray_obj_2d = np.ndarray + +from enum import Enum + +class _NoDefault(Enum): + no_default = ... + +no_default: Final = _NoDefault.no_default +NoDefault: TypeAlias = Literal[_NoDefault.no_default] + +i8max: int +u8max: int + +def is_np_dtype(dtype: object, kinds: str | None = ...) -> TypeGuard[np.dtype]: ... +def item_from_zerodim(val: object) -> object: ... +def infer_dtype(value: object, skipna: bool = ...) -> str: ... +def is_iterator(obj: object) -> bool: ... +def is_scalar(val: object) -> bool: ... +def is_list_like(obj: object, allow_sets: bool = ...) -> bool: ... +def is_pyarrow_array(obj: object) -> bool: ... +def is_period(val: object) -> TypeGuard[Period]: ... +def is_interval(obj: object) -> TypeGuard[Interval]: ... +def is_decimal(obj: object) -> TypeGuard[Decimal]: ... +def is_complex(obj: object) -> TypeGuard[complex]: ... +def is_bool(obj: object) -> TypeGuard[bool | np.bool_]: ... +def is_integer(obj: object) -> TypeGuard[int | np.integer]: ... +def is_int_or_none(obj) -> bool: ... +def is_float(obj: object) -> TypeGuard[float]: ... +def is_interval_array(values: np.ndarray) -> bool: ... +def is_datetime64_array(values: np.ndarray, skipna: bool = True) -> bool: ... +def is_timedelta_or_timedelta64_array( + values: np.ndarray, skipna: bool = True +) -> bool: ... +def is_datetime_with_singletz_array(values: np.ndarray) -> bool: ... +def is_time_array(values: np.ndarray, skipna: bool = ...): ... +def is_date_array(values: np.ndarray, skipna: bool = ...): ... +def is_datetime_array(values: np.ndarray, skipna: bool = ...): ... +def is_string_array(values: np.ndarray, skipna: bool = ...): ... +def is_float_array(values: np.ndarray): ... +def is_integer_array(values: np.ndarray, skipna: bool = ...): ... +def is_bool_array(values: np.ndarray, skipna: bool = ...): ... +def fast_multiget( + mapping: dict, + keys: np.ndarray, # object[:] + default=..., +) -> np.ndarray: ... +def fast_unique_multiple_list_gen(gen: Generator, sort: bool = ...) -> list: ... +def fast_unique_multiple_list(lists: list, sort: bool | None = ...) -> list: ... +def map_infer( + arr: np.ndarray, + f: Callable[[Any], Any], + convert: bool = ..., + ignore_na: bool = ..., +) -> np.ndarray: ... +@overload +def maybe_convert_objects( + objects: npt.NDArray[np.object_], + *, + try_float: bool = ..., + safe: bool = ..., + convert_numeric: bool = ..., + convert_non_numeric: Literal[False] = ..., + convert_to_nullable_dtype: Literal[False] = ..., + dtype_if_all_nat: DtypeObj | None = ..., +) -> npt.NDArray[np.object_ | np.number]: ... +@overload +def maybe_convert_objects( + objects: npt.NDArray[np.object_], + *, + try_float: bool = ..., + safe: bool = ..., + convert_numeric: bool = ..., + convert_non_numeric: bool = ..., + convert_to_nullable_dtype: Literal[True] = ..., + dtype_if_all_nat: DtypeObj | None = ..., +) -> ArrayLike: ... +@overload +def maybe_convert_objects( + objects: npt.NDArray[np.object_], + *, + try_float: bool = ..., + safe: bool = ..., + convert_numeric: bool = ..., + convert_non_numeric: bool = ..., + convert_to_nullable_dtype: bool = ..., + dtype_if_all_nat: DtypeObj | None = ..., +) -> ArrayLike: ... +@overload +def maybe_convert_numeric( + values: npt.NDArray[np.object_], + na_values: set, + convert_empty: bool = ..., + coerce_numeric: bool = ..., + convert_to_masked_nullable: Literal[False] = ..., +) -> tuple[np.ndarray, None]: ... +@overload +def maybe_convert_numeric( + values: npt.NDArray[np.object_], + na_values: set, + convert_empty: bool = ..., + coerce_numeric: bool = ..., + *, + convert_to_masked_nullable: Literal[True], +) -> tuple[np.ndarray, np.ndarray]: ... + +# TODO: restrict `arr`? +def ensure_string_array( + arr, + na_value: object = ..., + convert_na_value: bool = ..., + copy: bool = ..., + skipna: bool = ..., +) -> npt.NDArray[np.object_]: ... +def convert_nans_to_NA( + arr: npt.NDArray[np.object_], +) -> npt.NDArray[np.object_]: ... +def fast_zip(ndarrays: list) -> npt.NDArray[np.object_]: ... + +# TODO: can we be more specific about rows? +def to_object_array_tuples(rows: object) -> ndarray_obj_2d: ... +def tuples_to_object_array( + tuples: npt.NDArray[np.object_], +) -> ndarray_obj_2d: ... + +# TODO: can we be more specific about rows? +def to_object_array(rows: object, min_width: int = ...) -> ndarray_obj_2d: ... +def dicts_to_array(dicts: list, columns: list) -> ndarray_obj_2d: ... +def maybe_booleans_to_slice( + mask: npt.NDArray[np.uint8], +) -> slice | npt.NDArray[np.uint8]: ... +def maybe_indices_to_slice( + indices: npt.NDArray[np.intp], + max_len: int, +) -> slice | npt.NDArray[np.intp]: ... +def is_all_arraylike(obj: list) -> bool: ... + +# ----------------------------------------------------------------- +# Functions which in reality take memoryviews + +def memory_usage_of_objects(arr: np.ndarray) -> int: ... # object[:] # np.int64 +def map_infer_mask( + arr: np.ndarray, + f: Callable[[Any], Any], + mask: np.ndarray, # const uint8_t[:] + convert: bool = ..., + na_value: Any = ..., + dtype: np.dtype = ..., +) -> np.ndarray: ... +def indices_fast( + index: npt.NDArray[np.intp], + labels: np.ndarray, # const int64_t[:] + keys: list, + sorted_labels: list[npt.NDArray[np.int64]], +) -> dict[Hashable, npt.NDArray[np.intp]]: ... +def generate_slices( + labels: np.ndarray, ngroups: int # const intp_t[:] +) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: ... +def count_level_2d( + mask: np.ndarray, # ndarray[uint8_t, ndim=2, cast=True], + labels: np.ndarray, # const intp_t[:] + max_bin: int, +) -> np.ndarray: ... # np.ndarray[np.int64, ndim=2] +def get_level_sorter( + codes: np.ndarray, # const int64_t[:] + starts: np.ndarray, # const intp_t[:] +) -> np.ndarray: ... # np.ndarray[np.intp, ndim=1] +def generate_bins_dt64( + values: npt.NDArray[np.int64], + binner: np.ndarray, # const int64_t[:] + closed: object = ..., + hasnans: bool = ..., +) -> np.ndarray: ... # np.ndarray[np.int64, ndim=1] +def array_equivalent_object( + left: npt.NDArray[np.object_], + right: npt.NDArray[np.object_], +) -> bool: ... +def has_infs(arr: np.ndarray) -> bool: ... # const floating[:] +def has_only_ints_or_nan(arr: np.ndarray) -> bool: ... # const floating[:] +def get_reverse_indexer( + indexer: np.ndarray, # const intp_t[:] + length: int, +) -> npt.NDArray[np.intp]: ... +def is_bool_list(obj: list) -> bool: ... +def dtypes_all_equal(types: list[DtypeObj]) -> bool: ... +def is_range_indexer( + left: np.ndarray, n: int # np.ndarray[np.int64, ndim=1] +) -> bool: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/missing.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/missing.pyi new file mode 100644 index 0000000000000000000000000000000000000000..282dcee3ed6cfdd5cd83e5c78a69c422831b4cac --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/missing.pyi @@ -0,0 +1,16 @@ +import numpy as np +from numpy import typing as npt + +class NAType: + def __new__(cls, *args, **kwargs): ... + +NA: NAType + +def is_matching_na( + left: object, right: object, nan_matches_none: bool = ... +) -> bool: ... +def isposinf_scalar(val: object) -> bool: ... +def isneginf_scalar(val: object) -> bool: ... +def checknull(val: object, inf_as_na: bool = ...) -> bool: ... +def isnaobj(arr: np.ndarray, inf_as_na: bool = ...) -> npt.NDArray[np.bool_]: ... +def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/ops.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/ops.pyi new file mode 100644 index 0000000000000000000000000000000000000000..6738a1dff4a9eb272f3e6c88a31b0bd7386aebd2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/ops.pyi @@ -0,0 +1,51 @@ +from typing import ( + Any, + Callable, + Iterable, + Literal, + TypeAlias, + overload, +) + +import numpy as np + +from pandas._typing import npt + +_BinOp: TypeAlias = Callable[[Any, Any], Any] +_BoolOp: TypeAlias = Callable[[Any, Any], bool] + +def scalar_compare( + values: np.ndarray, # object[:] + val: object, + op: _BoolOp, # {operator.eq, operator.ne, ...} +) -> npt.NDArray[np.bool_]: ... +def vec_compare( + left: npt.NDArray[np.object_], + right: npt.NDArray[np.object_], + op: _BoolOp, # {operator.eq, operator.ne, ...} +) -> npt.NDArray[np.bool_]: ... +def scalar_binop( + values: np.ndarray, # object[:] + val: object, + op: _BinOp, # binary operator +) -> np.ndarray: ... +def vec_binop( + left: np.ndarray, # object[:] + right: np.ndarray, # object[:] + op: _BinOp, # binary operator +) -> np.ndarray: ... +@overload +def maybe_convert_bool( + arr: npt.NDArray[np.object_], + true_values: Iterable | None = None, + false_values: Iterable | None = None, + convert_to_masked_nullable: Literal[False] = ..., +) -> tuple[np.ndarray, None]: ... +@overload +def maybe_convert_bool( + arr: npt.NDArray[np.object_], + true_values: Iterable = ..., + false_values: Iterable = ..., + *, + convert_to_masked_nullable: Literal[True], +) -> tuple[np.ndarray, np.ndarray]: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/ops_dispatch.cpython-312-x86_64-linux-gnu.so b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/ops_dispatch.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..8d14a7dcfdddc458b8564a71dfc95dbcb1f0c129 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/ops_dispatch.cpython-312-x86_64-linux-gnu.so differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/ops_dispatch.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/ops_dispatch.pyi new file mode 100644 index 0000000000000000000000000000000000000000..91b5a4dbaaebc177191d3189f12e4e20d56ca0fa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/ops_dispatch.pyi @@ -0,0 +1,5 @@ +import numpy as np + +def maybe_dispatch_ufunc_to_dunder_op( + self, ufunc: np.ufunc, method: str, *inputs, **kwargs +): ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/pandas_datetime.cpython-312-x86_64-linux-gnu.so b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/pandas_datetime.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..50811df9cde05706c907232ecd9c757c66cb0200 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/pandas_datetime.cpython-312-x86_64-linux-gnu.so differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/pandas_parser.cpython-312-x86_64-linux-gnu.so b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/pandas_parser.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..d7305a4a04f77c0462678505702a6473e4a5a340 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/pandas_parser.cpython-312-x86_64-linux-gnu.so differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/parsers.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/parsers.pyi new file mode 100644 index 0000000000000000000000000000000000000000..253bb7303cefb81f61692c8d7bd9812a191d9ac5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/parsers.pyi @@ -0,0 +1,77 @@ +from typing import ( + Hashable, + Literal, +) + +import numpy as np + +from pandas._typing import ( + ArrayLike, + Dtype, + npt, +) + +STR_NA_VALUES: set[str] +DEFAULT_BUFFER_HEURISTIC: int + +def sanitize_objects( + values: npt.NDArray[np.object_], + na_values: set, +) -> int: ... + +class TextReader: + unnamed_cols: set[str] + table_width: int # int64_t + leading_cols: int # int64_t + header: list[list[int]] # non-negative integers + def __init__( + self, + source, + delimiter: bytes | str = ..., # single-character only + header=..., + header_start: int = ..., # int64_t + header_end: int = ..., # uint64_t + index_col=..., + names=..., + tokenize_chunksize: int = ..., # int64_t + delim_whitespace: bool = ..., + converters=..., + skipinitialspace: bool = ..., + escapechar: bytes | str | None = ..., # single-character only + doublequote: bool = ..., + quotechar: str | bytes | None = ..., # at most 1 character + quoting: int = ..., + lineterminator: bytes | str | None = ..., # at most 1 character + comment=..., + decimal: bytes | str = ..., # single-character only + thousands: bytes | str | None = ..., # single-character only + dtype: Dtype | dict[Hashable, Dtype] = ..., + usecols=..., + error_bad_lines: bool = ..., + warn_bad_lines: bool = ..., + na_filter: bool = ..., + na_values=..., + na_fvalues=..., + keep_default_na: bool = ..., + true_values=..., + false_values=..., + allow_leading_cols: bool = ..., + skiprows=..., + skipfooter: int = ..., # int64_t + verbose: bool = ..., + float_precision: Literal["round_trip", "legacy", "high"] | None = ..., + skip_blank_lines: bool = ..., + encoding_errors: bytes | str = ..., + ) -> None: ... + def set_noconvert(self, i: int) -> None: ... + def remove_noconvert(self, i: int) -> None: ... + def close(self) -> None: ... + def read(self, rows: int | None = ...) -> dict[int, ArrayLike]: ... + def read_low_memory(self, rows: int | None) -> list[dict[int, ArrayLike]]: ... + +# _maybe_upcast, na_values are only exposed for testing +na_values: dict + +def _maybe_upcast( + arr, use_dtype_backend: bool = ..., dtype_backend: str = ... +) -> np.ndarray: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/properties.cpython-312-x86_64-linux-gnu.so b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/properties.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..5a12850d890f8cf1cce5c01e010d7434ecf346a7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/properties.cpython-312-x86_64-linux-gnu.so differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/properties.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/properties.pyi new file mode 100644 index 0000000000000000000000000000000000000000..aaa44a0cf47bf8635727ea9f354227de72bbff29 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/properties.pyi @@ -0,0 +1,27 @@ +from typing import ( + Sequence, + overload, +) + +from pandas._typing import ( + AnyArrayLike, + DataFrame, + Index, + Series, +) + +# note: this is a lie to make type checkers happy (they special +# case property). cache_readonly uses attribute names similar to +# property (fget) but it does not provide fset and fdel. +cache_readonly = property + +class AxisProperty: + axis: int + def __init__(self, axis: int = ..., doc: str = ...) -> None: ... + @overload + def __get__(self, obj: DataFrame | Series, type) -> Index: ... + @overload + def __get__(self, obj: None, type) -> AxisProperty: ... + def __set__( + self, obj: DataFrame | Series, value: AnyArrayLike | Sequence + ) -> None: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/reshape.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/reshape.pyi new file mode 100644 index 0000000000000000000000000000000000000000..110687fcd0c313c45e8b025083fa5790fb9913b1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/reshape.pyi @@ -0,0 +1,16 @@ +import numpy as np + +from pandas._typing import npt + +def unstack( + values: np.ndarray, # reshape_t[:, :] + mask: np.ndarray, # const uint8_t[:] + stride: int, + length: int, + width: int, + new_values: np.ndarray, # reshape_t[:, :] + new_mask: np.ndarray, # uint8_t[:, :] +) -> None: ... +def explode( + values: npt.NDArray[np.object_], +) -> tuple[npt.NDArray[np.object_], npt.NDArray[np.int64]]: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/sas.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/sas.pyi new file mode 100644 index 0000000000000000000000000000000000000000..5d65e2b56b5916ed1e76e1409e4f75c652ee8fc9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/sas.pyi @@ -0,0 +1,7 @@ +from pandas.io.sas.sas7bdat import SAS7BDATReader + +class Parser: + def __init__(self, parser: SAS7BDATReader) -> None: ... + def read(self, nrows: int) -> None: ... + +def get_subheader_index(signature: bytes) -> int: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/sparse.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/sparse.pyi new file mode 100644 index 0000000000000000000000000000000000000000..536265b25425e9e7ecabd37f095115442a7cb209 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/sparse.pyi @@ -0,0 +1,51 @@ +from typing import Sequence + +import numpy as np + +from pandas._typing import ( + Self, + npt, +) + +class SparseIndex: + length: int + npoints: int + def __init__(self) -> None: ... + @property + def ngaps(self) -> int: ... + @property + def nbytes(self) -> int: ... + @property + def indices(self) -> npt.NDArray[np.int32]: ... + def equals(self, other) -> bool: ... + def lookup(self, index: int) -> np.int32: ... + def lookup_array(self, indexer: npt.NDArray[np.int32]) -> npt.NDArray[np.int32]: ... + def to_int_index(self) -> IntIndex: ... + def to_block_index(self) -> BlockIndex: ... + def intersect(self, y_: SparseIndex) -> Self: ... + def make_union(self, y_: SparseIndex) -> Self: ... + +class IntIndex(SparseIndex): + indices: npt.NDArray[np.int32] + def __init__( + self, length: int, indices: Sequence[int], check_integrity: bool = ... + ) -> None: ... + +class BlockIndex(SparseIndex): + nblocks: int + blocs: np.ndarray + blengths: np.ndarray + def __init__( + self, length: int, blocs: np.ndarray, blengths: np.ndarray + ) -> None: ... + + # Override to have correct parameters + def intersect(self, other: SparseIndex) -> Self: ... + def make_union(self, y: SparseIndex) -> Self: ... + +def make_mask_object_ndarray( + arr: npt.NDArray[np.object_], fill_value +) -> npt.NDArray[np.bool_]: ... +def get_blocks( + indices: npt.NDArray[np.int32], +) -> tuple[npt.NDArray[np.int32], npt.NDArray[np.int32]]: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/testing.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/testing.pyi new file mode 100644 index 0000000000000000000000000000000000000000..01da496975f512b204defb06fcd19a36e235f97a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/testing.pyi @@ -0,0 +1,12 @@ +def assert_dict_equal(a, b, compare_keys: bool = ...): ... +def assert_almost_equal( + a, + b, + rtol: float = ..., + atol: float = ..., + check_dtype: bool = ..., + obj=..., + lobj=..., + robj=..., + index_values=..., +): ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslib.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslib.pyi new file mode 100644 index 0000000000000000000000000000000000000000..5a340c1d88bc439663c2e482c9eb7270f8ebc5c6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslib.pyi @@ -0,0 +1,37 @@ +from datetime import tzinfo + +import numpy as np + +from pandas._typing import npt + +def format_array_from_datetime( + values: npt.NDArray[np.int64], + tz: tzinfo | None = ..., + format: str | None = ..., + na_rep: str | float = ..., + reso: int = ..., # NPY_DATETIMEUNIT +) -> npt.NDArray[np.object_]: ... +def array_with_unit_to_datetime( + values: npt.NDArray[np.object_], + unit: str, + errors: str = ..., +) -> tuple[np.ndarray, tzinfo | None]: ... +def first_non_null(values: np.ndarray) -> int: ... +def array_to_datetime( + values: npt.NDArray[np.object_], + errors: str = ..., + dayfirst: bool = ..., + yearfirst: bool = ..., + utc: bool = ..., + creso: int = ..., +) -> tuple[np.ndarray, tzinfo | None]: ... + +# returned ndarray may be object dtype or datetime64[ns] + +def array_to_datetime_with_tz( + values: npt.NDArray[np.object_], + tz: tzinfo, + dayfirst: bool, + yearfirst: bool, + creso: int, +) -> npt.NDArray[np.int64]: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/writers.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/writers.pyi new file mode 100644 index 0000000000000000000000000000000000000000..7b41856525dadf79a2bf4b29c7ddebfedaa880db --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/writers.pyi @@ -0,0 +1,20 @@ +import numpy as np + +from pandas._typing import ArrayLike + +def write_csv_rows( + data: list[ArrayLike], + data_index: np.ndarray, + nlevels: int, + cols: np.ndarray, + writer: object, # _csv.writer +) -> None: ... +def convert_json_to_lines(arr: str) -> str: ... +def max_len_string_array( + arr: np.ndarray, # pandas_string[:] +) -> int: ... +def word_len(val: object) -> int: ... +def string_array_replace_from_nan_rep( + arr: np.ndarray, # np.ndarray[object, ndim=1] + nan_rep: object, +) -> None: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a0d42b6541fdf8817b996ef9804db8a87a2bcd2c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/__init__.py @@ -0,0 +1,16 @@ +""" public toolkit API """ +from pandas.api import ( + extensions, + indexers, + interchange, + types, + typing, +) + +__all__ = [ + "interchange", + "extensions", + "indexers", + "types", + "typing", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5ada6d705172feeeea4e7c8fbe0b33f8ab495223 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/__init__.py @@ -0,0 +1,199 @@ +""" +compat +====== + +Cross-compatible functions for different versions of Python. + +Other items: +* platform checker +""" +from __future__ import annotations + +import os +import platform +import sys +from typing import TYPE_CHECKING + +from pandas.compat._constants import ( + IS64, + ISMUSL, + PY310, + PY311, + PY312, + PYPY, +) +import pandas.compat.compressors +from pandas.compat.numpy import is_numpy_dev +from pandas.compat.pyarrow import ( + pa_version_under10p1, + pa_version_under11p0, + pa_version_under13p0, + pa_version_under14p0, + pa_version_under14p1, + pa_version_under16p0, + pa_version_under17p0, +) + +if TYPE_CHECKING: + from pandas._typing import F + + +def set_function_name(f: F, name: str, cls: type) -> F: + """ + Bind the name/qualname attributes of the function. + """ + f.__name__ = name + f.__qualname__ = f"{cls.__name__}.{name}" + f.__module__ = cls.__module__ + return f + + +def is_platform_little_endian() -> bool: + """ + Checking if the running platform is little endian. + + Returns + ------- + bool + True if the running platform is little endian. + """ + return sys.byteorder == "little" + + +def is_platform_windows() -> bool: + """ + Checking if the running platform is windows. + + Returns + ------- + bool + True if the running platform is windows. + """ + return sys.platform in ["win32", "cygwin"] + + +def is_platform_linux() -> bool: + """ + Checking if the running platform is linux. + + Returns + ------- + bool + True if the running platform is linux. + """ + return sys.platform == "linux" + + +def is_platform_mac() -> bool: + """ + Checking if the running platform is mac. + + Returns + ------- + bool + True if the running platform is mac. + """ + return sys.platform == "darwin" + + +def is_platform_arm() -> bool: + """ + Checking if the running platform use ARM architecture. + + Returns + ------- + bool + True if the running platform uses ARM architecture. + """ + return platform.machine() in ("arm64", "aarch64") or platform.machine().startswith( + "armv" + ) + + +def is_platform_power() -> bool: + """ + Checking if the running platform use Power architecture. + + Returns + ------- + bool + True if the running platform uses ARM architecture. + """ + return platform.machine() in ("ppc64", "ppc64le") + + +def is_ci_environment() -> bool: + """ + Checking if running in a continuous integration environment by checking + the PANDAS_CI environment variable. + + Returns + ------- + bool + True if the running in a continuous integration environment. + """ + return os.environ.get("PANDAS_CI", "0") == "1" + + +def get_lzma_file() -> type[pandas.compat.compressors.LZMAFile]: + """ + Importing the `LZMAFile` class from the `lzma` module. + + Returns + ------- + class + The `LZMAFile` class from the `lzma` module. + + Raises + ------ + RuntimeError + If the `lzma` module was not imported correctly, or didn't exist. + """ + if not pandas.compat.compressors.has_lzma: + raise RuntimeError( + "lzma module not available. " + "A Python re-install with the proper dependencies, " + "might be required to solve this issue." + ) + return pandas.compat.compressors.LZMAFile + + +def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: + """ + Importing the `BZ2File` class from the `bz2` module. + + Returns + ------- + class + The `BZ2File` class from the `bz2` module. + + Raises + ------ + RuntimeError + If the `bz2` module was not imported correctly, or didn't exist. + """ + if not pandas.compat.compressors.has_bz2: + raise RuntimeError( + "bz2 module not available. " + "A Python re-install with the proper dependencies, " + "might be required to solve this issue." + ) + return pandas.compat.compressors.BZ2File + + +__all__ = [ + "is_numpy_dev", + "pa_version_under10p1", + "pa_version_under11p0", + "pa_version_under13p0", + "pa_version_under14p0", + "pa_version_under14p1", + "pa_version_under16p0", + "pa_version_under17p0", + "IS64", + "ISMUSL", + "PY310", + "PY311", + "PY312", + "PYPY", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/_constants.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/_constants.py new file mode 100644 index 0000000000000000000000000000000000000000..7bc3fbaaefebf69d8ebd622406dc9357237add1a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/_constants.py @@ -0,0 +1,30 @@ +""" +_constants +====== + +Constants relevant for the Python implementation. +""" + +from __future__ import annotations + +import platform +import sys +import sysconfig + +IS64 = sys.maxsize > 2**32 + +PY310 = sys.version_info >= (3, 10) +PY311 = sys.version_info >= (3, 11) +PY312 = sys.version_info >= (3, 12) +PYPY = platform.python_implementation() == "PyPy" +ISMUSL = "musl" in (sysconfig.get_config_var("HOST_GNU_TYPE") or "") +REF_COUNT = 2 if PY311 else 3 + +__all__ = [ + "IS64", + "ISMUSL", + "PY310", + "PY311", + "PY312", + "PYPY", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/_optional.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/_optional.py new file mode 100644 index 0000000000000000000000000000000000000000..2bc6cd46f09a7e4b103658f9c2ec9a69d93d00b9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/_optional.py @@ -0,0 +1,168 @@ +from __future__ import annotations + +import importlib +import sys +from typing import TYPE_CHECKING +import warnings + +from pandas.util._exceptions import find_stack_level + +from pandas.util.version import Version + +if TYPE_CHECKING: + import types + +# Update install.rst & setup.cfg when updating versions! + +VERSIONS = { + "adbc-driver-postgresql": "0.8.0", + "adbc-driver-sqlite": "0.8.0", + "bs4": "4.11.2", + "blosc": "1.21.3", + "bottleneck": "1.3.6", + "dataframe-api-compat": "0.1.7", + "fastparquet": "2022.12.0", + "fsspec": "2022.11.0", + "html5lib": "1.1", + "hypothesis": "6.46.1", + "gcsfs": "2022.11.0", + "jinja2": "3.1.2", + "lxml.etree": "4.9.2", + "matplotlib": "3.6.3", + "numba": "0.56.4", + "numexpr": "2.8.4", + "odfpy": "1.4.1", + "openpyxl": "3.1.0", + "pandas_gbq": "0.19.0", + "psycopg2": "2.9.6", # (dt dec pq3 ext lo64) + "pymysql": "1.0.2", + "pyarrow": "10.0.1", + "pyreadstat": "1.2.0", + "pytest": "7.3.2", + "python-calamine": "0.1.7", + "pyxlsb": "1.0.10", + "s3fs": "2022.11.0", + "scipy": "1.10.0", + "sqlalchemy": "2.0.0", + "tables": "3.8.0", + "tabulate": "0.9.0", + "xarray": "2022.12.0", + "xlrd": "2.0.1", + "xlsxwriter": "3.0.5", + "zstandard": "0.19.0", + "tzdata": "2022.7", + "qtpy": "2.3.0", + "pyqt5": "5.15.9", +} + +# A mapping from import name to package name (on PyPI) for packages where +# these two names are different. + +INSTALL_MAPPING = { + "bs4": "beautifulsoup4", + "bottleneck": "Bottleneck", + "jinja2": "Jinja2", + "lxml.etree": "lxml", + "odf": "odfpy", + "pandas_gbq": "pandas-gbq", + "python_calamine": "python-calamine", + "sqlalchemy": "SQLAlchemy", + "tables": "pytables", +} + + +def get_version(module: types.ModuleType) -> str: + version = getattr(module, "__version__", None) + + if version is None: + raise ImportError(f"Can't determine version for {module.__name__}") + if module.__name__ == "psycopg2": + # psycopg2 appends " (dt dec pq3 ext lo64)" to it's version + version = version.split()[0] + return version + + +def import_optional_dependency( + name: str, + extra: str = "", + errors: str = "raise", + min_version: str | None = None, +): + """ + Import an optional dependency. + + By default, if a dependency is missing an ImportError with a nice + message will be raised. If a dependency is present, but too old, + we raise. + + Parameters + ---------- + name : str + The module name. + extra : str + Additional text to include in the ImportError message. + errors : str {'raise', 'warn', 'ignore'} + What to do when a dependency is not found or its version is too old. + + * raise : Raise an ImportError + * warn : Only applicable when a module's version is to old. + Warns that the version is too old and returns None + * ignore: If the module is not installed, return None, otherwise, + return the module, even if the version is too old. + It's expected that users validate the version locally when + using ``errors="ignore"`` (see. ``io/html.py``) + min_version : str, default None + Specify a minimum version that is different from the global pandas + minimum version required. + Returns + ------- + maybe_module : Optional[ModuleType] + The imported module, when found and the version is correct. + None is returned when the package is not found and `errors` + is False, or when the package's version is too old and `errors` + is ``'warn'`` or ``'ignore'``. + """ + assert errors in {"warn", "raise", "ignore"} + + package_name = INSTALL_MAPPING.get(name) + install_name = package_name if package_name is not None else name + + msg = ( + f"Missing optional dependency '{install_name}'. {extra} " + f"Use pip or conda to install {install_name}." + ) + try: + module = importlib.import_module(name) + except ImportError: + if errors == "raise": + raise ImportError(msg) + return None + + # Handle submodules: if we have submodule, grab parent module from sys.modules + parent = name.split(".")[0] + if parent != name: + install_name = parent + module_to_get = sys.modules[install_name] + else: + module_to_get = module + minimum_version = min_version if min_version is not None else VERSIONS.get(parent) + if minimum_version: + version = get_version(module_to_get) + if version and Version(version) < Version(minimum_version): + msg = ( + f"Pandas requires version '{minimum_version}' or newer of '{parent}' " + f"(version '{version}' currently installed)." + ) + if errors == "warn": + warnings.warn( + msg, + UserWarning, + stacklevel=find_stack_level(), + ) + return None + elif errors == "raise": + raise ImportError(msg) + else: + return None + + return module diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/compressors.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/compressors.py new file mode 100644 index 0000000000000000000000000000000000000000..1f31e34c092c9672559ca2f5194cb1da7083d03b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/compressors.py @@ -0,0 +1,77 @@ +""" +Patched ``BZ2File`` and ``LZMAFile`` to handle pickle protocol 5. +""" + +from __future__ import annotations + +from pickle import PickleBuffer + +from pandas.compat._constants import PY310 + +try: + import bz2 + + has_bz2 = True +except ImportError: + has_bz2 = False + +try: + import lzma + + has_lzma = True +except ImportError: + has_lzma = False + + +def flatten_buffer( + b: bytes | bytearray | memoryview | PickleBuffer, +) -> bytes | bytearray | memoryview: + """ + Return some 1-D `uint8` typed buffer. + + Coerces anything that does not match that description to one that does + without copying if possible (otherwise will copy). + """ + + if isinstance(b, (bytes, bytearray)): + return b + + if not isinstance(b, PickleBuffer): + b = PickleBuffer(b) + + try: + # coerce to 1-D `uint8` C-contiguous `memoryview` zero-copy + return b.raw() + except BufferError: + # perform in-memory copy if buffer is not contiguous + return memoryview(b).tobytes("A") + + +if has_bz2: + + class BZ2File(bz2.BZ2File): + if not PY310: + + def write(self, b) -> int: + # Workaround issue where `bz2.BZ2File` expects `len` + # to return the number of bytes in `b` by converting + # `b` into something that meets that constraint with + # minimal copying. + # + # Note: This is fixed in Python 3.10. + return super().write(flatten_buffer(b)) + + +if has_lzma: + + class LZMAFile(lzma.LZMAFile): + if not PY310: + + def write(self, b) -> int: + # Workaround issue where `lzma.LZMAFile` expects `len` + # to return the number of bytes in `b` by converting + # `b` into something that meets that constraint with + # minimal copying. + # + # Note: This is fixed in Python 3.10. + return super().write(flatten_buffer(b)) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/pickle_compat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/pickle_compat.py new file mode 100644 index 0000000000000000000000000000000000000000..cd98087c06c18634304c29d88837017a6952a4fc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/pickle_compat.py @@ -0,0 +1,262 @@ +""" +Support pre-0.12 series pickle compatibility. +""" +from __future__ import annotations + +import contextlib +import copy +import io +import pickle as pkl +from typing import TYPE_CHECKING + +import numpy as np + +from pandas._libs.arrays import NDArrayBacked +from pandas._libs.tslibs import BaseOffset + +from pandas import Index +from pandas.core.arrays import ( + DatetimeArray, + PeriodArray, + TimedeltaArray, +) +from pandas.core.internals import BlockManager + +if TYPE_CHECKING: + from collections.abc import Generator + + +def load_reduce(self) -> None: + stack = self.stack + args = stack.pop() + func = stack[-1] + + try: + stack[-1] = func(*args) + return + except TypeError as err: + # If we have a deprecated function, + # try to replace and try again. + + msg = "_reconstruct: First argument must be a sub-type of ndarray" + + if msg in str(err): + try: + cls = args[0] + stack[-1] = object.__new__(cls) + return + except TypeError: + pass + elif args and isinstance(args[0], type) and issubclass(args[0], BaseOffset): + # TypeError: object.__new__(Day) is not safe, use Day.__new__() + cls = args[0] + stack[-1] = cls.__new__(*args) + return + elif args and issubclass(args[0], PeriodArray): + cls = args[0] + stack[-1] = NDArrayBacked.__new__(*args) + return + + raise + + +# If classes are moved, provide compat here. +_class_locations_map = { + ("pandas.core.sparse.array", "SparseArray"): ("pandas.core.arrays", "SparseArray"), + # 15477 + ("pandas.core.base", "FrozenNDArray"): ("numpy", "ndarray"), + # Re-routing unpickle block logic to go through _unpickle_block instead + # for pandas <= 1.3.5 + ("pandas.core.internals.blocks", "new_block"): ( + "pandas._libs.internals", + "_unpickle_block", + ), + ("pandas.core.indexes.frozen", "FrozenNDArray"): ("numpy", "ndarray"), + ("pandas.core.base", "FrozenList"): ("pandas.core.indexes.frozen", "FrozenList"), + # 10890 + ("pandas.core.series", "TimeSeries"): ("pandas.core.series", "Series"), + ("pandas.sparse.series", "SparseTimeSeries"): ( + "pandas.core.sparse.series", + "SparseSeries", + ), + # 12588, extensions moving + ("pandas._sparse", "BlockIndex"): ("pandas._libs.sparse", "BlockIndex"), + ("pandas.tslib", "Timestamp"): ("pandas._libs.tslib", "Timestamp"), + # 18543 moving period + ("pandas._period", "Period"): ("pandas._libs.tslibs.period", "Period"), + ("pandas._libs.period", "Period"): ("pandas._libs.tslibs.period", "Period"), + # 18014 moved __nat_unpickle from _libs.tslib-->_libs.tslibs.nattype + ("pandas.tslib", "__nat_unpickle"): ( + "pandas._libs.tslibs.nattype", + "__nat_unpickle", + ), + ("pandas._libs.tslib", "__nat_unpickle"): ( + "pandas._libs.tslibs.nattype", + "__nat_unpickle", + ), + # 15998 top-level dirs moving + ("pandas.sparse.array", "SparseArray"): ( + "pandas.core.arrays.sparse", + "SparseArray", + ), + ("pandas.indexes.base", "_new_Index"): ("pandas.core.indexes.base", "_new_Index"), + ("pandas.indexes.base", "Index"): ("pandas.core.indexes.base", "Index"), + ("pandas.indexes.numeric", "Int64Index"): ( + "pandas.core.indexes.base", + "Index", # updated in 50775 + ), + ("pandas.indexes.range", "RangeIndex"): ("pandas.core.indexes.range", "RangeIndex"), + ("pandas.indexes.multi", "MultiIndex"): ("pandas.core.indexes.multi", "MultiIndex"), + ("pandas.tseries.index", "_new_DatetimeIndex"): ( + "pandas.core.indexes.datetimes", + "_new_DatetimeIndex", + ), + ("pandas.tseries.index", "DatetimeIndex"): ( + "pandas.core.indexes.datetimes", + "DatetimeIndex", + ), + ("pandas.tseries.period", "PeriodIndex"): ( + "pandas.core.indexes.period", + "PeriodIndex", + ), + # 19269, arrays moving + ("pandas.core.categorical", "Categorical"): ("pandas.core.arrays", "Categorical"), + # 19939, add timedeltaindex, float64index compat from 15998 move + ("pandas.tseries.tdi", "TimedeltaIndex"): ( + "pandas.core.indexes.timedeltas", + "TimedeltaIndex", + ), + ("pandas.indexes.numeric", "Float64Index"): ( + "pandas.core.indexes.base", + "Index", # updated in 50775 + ), + # 50775, remove Int64Index, UInt64Index & Float64Index from codabase + ("pandas.core.indexes.numeric", "Int64Index"): ( + "pandas.core.indexes.base", + "Index", + ), + ("pandas.core.indexes.numeric", "UInt64Index"): ( + "pandas.core.indexes.base", + "Index", + ), + ("pandas.core.indexes.numeric", "Float64Index"): ( + "pandas.core.indexes.base", + "Index", + ), + ("pandas.core.arrays.sparse.dtype", "SparseDtype"): ( + "pandas.core.dtypes.dtypes", + "SparseDtype", + ), +} + + +# our Unpickler sub-class to override methods and some dispatcher +# functions for compat and uses a non-public class of the pickle module. + + +class Unpickler(pkl._Unpickler): + def find_class(self, module, name): + # override superclass + key = (module, name) + module, name = _class_locations_map.get(key, key) + return super().find_class(module, name) + + +Unpickler.dispatch = copy.copy(Unpickler.dispatch) +Unpickler.dispatch[pkl.REDUCE[0]] = load_reduce + + +def load_newobj(self) -> None: + args = self.stack.pop() + cls = self.stack[-1] + + # compat + if issubclass(cls, Index): + obj = object.__new__(cls) + elif issubclass(cls, DatetimeArray) and not args: + arr = np.array([], dtype="M8[ns]") + obj = cls.__new__(cls, arr, arr.dtype) + elif issubclass(cls, TimedeltaArray) and not args: + arr = np.array([], dtype="m8[ns]") + obj = cls.__new__(cls, arr, arr.dtype) + elif cls is BlockManager and not args: + obj = cls.__new__(cls, (), [], False) + else: + obj = cls.__new__(cls, *args) + + self.stack[-1] = obj + + +Unpickler.dispatch[pkl.NEWOBJ[0]] = load_newobj + + +def load_newobj_ex(self) -> None: + kwargs = self.stack.pop() + args = self.stack.pop() + cls = self.stack.pop() + + # compat + if issubclass(cls, Index): + obj = object.__new__(cls) + else: + obj = cls.__new__(cls, *args, **kwargs) + self.append(obj) + + +try: + Unpickler.dispatch[pkl.NEWOBJ_EX[0]] = load_newobj_ex +except (AttributeError, KeyError): + pass + + +def load(fh, encoding: str | None = None, is_verbose: bool = False): + """ + Load a pickle, with a provided encoding, + + Parameters + ---------- + fh : a filelike object + encoding : an optional encoding + is_verbose : show exception output + """ + try: + fh.seek(0) + if encoding is not None: + up = Unpickler(fh, encoding=encoding) + else: + up = Unpickler(fh) + # "Unpickler" has no attribute "is_verbose" [attr-defined] + up.is_verbose = is_verbose # type: ignore[attr-defined] + + return up.load() + except (ValueError, TypeError): + raise + + +def loads( + bytes_object: bytes, + *, + fix_imports: bool = True, + encoding: str = "ASCII", + errors: str = "strict", +): + """ + Analogous to pickle._loads. + """ + fd = io.BytesIO(bytes_object) + return Unpickler( + fd, fix_imports=fix_imports, encoding=encoding, errors=errors + ).load() + + +@contextlib.contextmanager +def patch_pickle() -> Generator[None, None, None]: + """ + Temporarily patch pickle to use our unpickler. + """ + orig_loads = pkl.loads + try: + setattr(pkl, "loads", loads) + yield + finally: + setattr(pkl, "loads", orig_loads) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/pyarrow.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/pyarrow.py new file mode 100644 index 0000000000000000000000000000000000000000..457d26766520d6fe081b5b90145fcd832baee6a1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/pyarrow.py @@ -0,0 +1,29 @@ +""" support pyarrow compatibility across versions """ + +from __future__ import annotations + +from pandas.util.version import Version + +try: + import pyarrow as pa + + _palv = Version(Version(pa.__version__).base_version) + pa_version_under10p1 = _palv < Version("10.0.1") + pa_version_under11p0 = _palv < Version("11.0.0") + pa_version_under12p0 = _palv < Version("12.0.0") + pa_version_under13p0 = _palv < Version("13.0.0") + pa_version_under14p0 = _palv < Version("14.0.0") + pa_version_under14p1 = _palv < Version("14.0.1") + pa_version_under15p0 = _palv < Version("15.0.0") + pa_version_under16p0 = _palv < Version("16.0.0") + pa_version_under17p0 = _palv < Version("17.0.0") +except ImportError: + pa_version_under10p1 = True + pa_version_under11p0 = True + pa_version_under12p0 = True + pa_version_under13p0 = True + pa_version_under14p0 = True + pa_version_under14p1 = True + pa_version_under15p0 = True + pa_version_under16p0 = True + pa_version_under17p0 = True diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/accessor.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/accessor.py new file mode 100644 index 0000000000000000000000000000000000000000..698abb2ec4989b2f95f6b2c012f03873bba328cb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/accessor.py @@ -0,0 +1,340 @@ +""" + +accessor.py contains base classes for implementing accessor properties +that can be mixed into or pinned onto other pandas classes. + +""" +from __future__ import annotations + +from typing import ( + Callable, + final, +) +import warnings + +from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level + + +class DirNamesMixin: + _accessors: set[str] = set() + _hidden_attrs: frozenset[str] = frozenset() + + @final + def _dir_deletions(self) -> set[str]: + """ + Delete unwanted __dir__ for this object. + """ + return self._accessors | self._hidden_attrs + + def _dir_additions(self) -> set[str]: + """ + Add additional __dir__ for this object. + """ + return {accessor for accessor in self._accessors if hasattr(self, accessor)} + + def __dir__(self) -> list[str]: + """ + Provide method name lookup and completion. + + Notes + ----- + Only provide 'public' methods. + """ + rv = set(super().__dir__()) + rv = (rv - self._dir_deletions()) | self._dir_additions() + return sorted(rv) + + +class PandasDelegate: + """ + Abstract base class for delegating methods/properties. + """ + + def _delegate_property_get(self, name: str, *args, **kwargs): + raise TypeError(f"You cannot access the property {name}") + + def _delegate_property_set(self, name: str, value, *args, **kwargs): + raise TypeError(f"The property {name} cannot be set") + + def _delegate_method(self, name: str, *args, **kwargs): + raise TypeError(f"You cannot call method {name}") + + @classmethod + def _add_delegate_accessors( + cls, + delegate, + accessors: list[str], + typ: str, + overwrite: bool = False, + accessor_mapping: Callable[[str], str] = lambda x: x, + raise_on_missing: bool = True, + ) -> None: + """ + Add accessors to cls from the delegate class. + + Parameters + ---------- + cls + Class to add the methods/properties to. + delegate + Class to get methods/properties and doc-strings. + accessors : list of str + List of accessors to add. + typ : {'property', 'method'} + overwrite : bool, default False + Overwrite the method/property in the target class if it exists. + accessor_mapping: Callable, default lambda x: x + Callable to map the delegate's function to the cls' function. + raise_on_missing: bool, default True + Raise if an accessor does not exist on delegate. + False skips the missing accessor. + """ + + def _create_delegator_property(name: str): + def _getter(self): + return self._delegate_property_get(name) + + def _setter(self, new_values): + return self._delegate_property_set(name, new_values) + + _getter.__name__ = name + _setter.__name__ = name + + return property( + fget=_getter, + fset=_setter, + doc=getattr(delegate, accessor_mapping(name)).__doc__, + ) + + def _create_delegator_method(name: str): + def f(self, *args, **kwargs): + return self._delegate_method(name, *args, **kwargs) + + f.__name__ = name + f.__doc__ = getattr(delegate, accessor_mapping(name)).__doc__ + + return f + + for name in accessors: + if ( + not raise_on_missing + and getattr(delegate, accessor_mapping(name), None) is None + ): + continue + + if typ == "property": + f = _create_delegator_property(name) + else: + f = _create_delegator_method(name) + + # don't overwrite existing methods/properties + if overwrite or not hasattr(cls, name): + setattr(cls, name, f) + + +def delegate_names( + delegate, + accessors: list[str], + typ: str, + overwrite: bool = False, + accessor_mapping: Callable[[str], str] = lambda x: x, + raise_on_missing: bool = True, +): + """ + Add delegated names to a class using a class decorator. This provides + an alternative usage to directly calling `_add_delegate_accessors` + below a class definition. + + Parameters + ---------- + delegate : object + The class to get methods/properties & doc-strings. + accessors : Sequence[str] + List of accessor to add. + typ : {'property', 'method'} + overwrite : bool, default False + Overwrite the method/property in the target class if it exists. + accessor_mapping: Callable, default lambda x: x + Callable to map the delegate's function to the cls' function. + raise_on_missing: bool, default True + Raise if an accessor does not exist on delegate. + False skips the missing accessor. + + Returns + ------- + callable + A class decorator. + + Examples + -------- + @delegate_names(Categorical, ["categories", "ordered"], "property") + class CategoricalAccessor(PandasDelegate): + [...] + """ + + def add_delegate_accessors(cls): + cls._add_delegate_accessors( + delegate, + accessors, + typ, + overwrite=overwrite, + accessor_mapping=accessor_mapping, + raise_on_missing=raise_on_missing, + ) + return cls + + return add_delegate_accessors + + +# Ported with modifications from xarray; licence at LICENSES/XARRAY_LICENSE +# https://github.com/pydata/xarray/blob/master/xarray/core/extensions.py +# 1. We don't need to catch and re-raise AttributeErrors as RuntimeErrors +# 2. We use a UserWarning instead of a custom Warning + + +class CachedAccessor: + """ + Custom property-like object. + + A descriptor for caching accessors. + + Parameters + ---------- + name : str + Namespace that will be accessed under, e.g. ``df.foo``. + accessor : cls + Class with the extension methods. + + Notes + ----- + For accessor, The class's __init__ method assumes that one of + ``Series``, ``DataFrame`` or ``Index`` as the + single argument ``data``. + """ + + def __init__(self, name: str, accessor) -> None: + self._name = name + self._accessor = accessor + + def __get__(self, obj, cls): + if obj is None: + # we're accessing the attribute of the class, i.e., Dataset.geo + return self._accessor + accessor_obj = self._accessor(obj) + # Replace the property with the accessor object. Inspired by: + # https://www.pydanny.com/cached-property.html + # We need to use object.__setattr__ because we overwrite __setattr__ on + # NDFrame + object.__setattr__(obj, self._name, accessor_obj) + return accessor_obj + + +@doc(klass="", others="") +def _register_accessor(name: str, cls): + """ + Register a custom accessor on {klass} objects. + + Parameters + ---------- + name : str + Name under which the accessor should be registered. A warning is issued + if this name conflicts with a preexisting attribute. + + Returns + ------- + callable + A class decorator. + + See Also + -------- + register_dataframe_accessor : Register a custom accessor on DataFrame objects. + register_series_accessor : Register a custom accessor on Series objects. + register_index_accessor : Register a custom accessor on Index objects. + + Notes + ----- + When accessed, your accessor will be initialized with the pandas object + the user is interacting with. So the signature must be + + .. code-block:: python + + def __init__(self, pandas_object): # noqa: E999 + ... + + For consistency with pandas methods, you should raise an ``AttributeError`` + if the data passed to your accessor has an incorrect dtype. + + >>> pd.Series(['a', 'b']).dt + Traceback (most recent call last): + ... + AttributeError: Can only use .dt accessor with datetimelike values + + Examples + -------- + In your library code:: + + import pandas as pd + + @pd.api.extensions.register_dataframe_accessor("geo") + class GeoAccessor: + def __init__(self, pandas_obj): + self._obj = pandas_obj + + @property + def center(self): + # return the geographic center point of this DataFrame + lat = self._obj.latitude + lon = self._obj.longitude + return (float(lon.mean()), float(lat.mean())) + + def plot(self): + # plot this array's data on a map, e.g., using Cartopy + pass + + Back in an interactive IPython session: + + .. code-block:: ipython + + In [1]: ds = pd.DataFrame({{"longitude": np.linspace(0, 10), + ...: "latitude": np.linspace(0, 20)}}) + In [2]: ds.geo.center + Out[2]: (5.0, 10.0) + In [3]: ds.geo.plot() # plots data on a map + """ + + def decorator(accessor): + if hasattr(cls, name): + warnings.warn( + f"registration of accessor {repr(accessor)} under name " + f"{repr(name)} for type {repr(cls)} is overriding a preexisting " + f"attribute with the same name.", + UserWarning, + stacklevel=find_stack_level(), + ) + setattr(cls, name, CachedAccessor(name, accessor)) + cls._accessors.add(name) + return accessor + + return decorator + + +@doc(_register_accessor, klass="DataFrame") +def register_dataframe_accessor(name: str): + from pandas import DataFrame + + return _register_accessor(name, DataFrame) + + +@doc(_register_accessor, klass="Series") +def register_series_accessor(name: str): + from pandas import Series + + return _register_accessor(name, Series) + + +@doc(_register_accessor, klass="Index") +def register_index_accessor(name: str): + from pandas import Index + + return _register_accessor(name, Index) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/api.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/api.py new file mode 100644 index 0000000000000000000000000000000000000000..2cfe5ffc0170d61bca248bca43dda7c2f082496f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/api.py @@ -0,0 +1,140 @@ +from pandas._libs import ( + NaT, + Period, + Timedelta, + Timestamp, +) +from pandas._libs.missing import NA + +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + CategoricalDtype, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, +) +from pandas.core.dtypes.missing import ( + isna, + isnull, + notna, + notnull, +) + +from pandas.core.algorithms import ( + factorize, + unique, + value_counts, +) +from pandas.core.arrays import Categorical +from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.arrays.floating import ( + Float32Dtype, + Float64Dtype, +) +from pandas.core.arrays.integer import ( + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, +) +from pandas.core.arrays.string_ import StringDtype +from pandas.core.construction import array +from pandas.core.flags import Flags +from pandas.core.groupby import ( + Grouper, + NamedAgg, +) +from pandas.core.indexes.api import ( + CategoricalIndex, + DatetimeIndex, + Index, + IntervalIndex, + MultiIndex, + PeriodIndex, + RangeIndex, + TimedeltaIndex, +) +from pandas.core.indexes.datetimes import ( + bdate_range, + date_range, +) +from pandas.core.indexes.interval import ( + Interval, + interval_range, +) +from pandas.core.indexes.period import period_range +from pandas.core.indexes.timedeltas import timedelta_range +from pandas.core.indexing import IndexSlice +from pandas.core.series import Series +from pandas.core.tools.datetimes import to_datetime +from pandas.core.tools.numeric import to_numeric +from pandas.core.tools.timedeltas import to_timedelta + +from pandas.io.formats.format import set_eng_float_format +from pandas.tseries.offsets import DateOffset + +# DataFrame needs to be imported after NamedAgg to avoid a circular import +from pandas.core.frame import DataFrame # isort:skip + +__all__ = [ + "array", + "ArrowDtype", + "bdate_range", + "BooleanDtype", + "Categorical", + "CategoricalDtype", + "CategoricalIndex", + "DataFrame", + "DateOffset", + "date_range", + "DatetimeIndex", + "DatetimeTZDtype", + "factorize", + "Flags", + "Float32Dtype", + "Float64Dtype", + "Grouper", + "Index", + "IndexSlice", + "Int16Dtype", + "Int32Dtype", + "Int64Dtype", + "Int8Dtype", + "Interval", + "IntervalDtype", + "IntervalIndex", + "interval_range", + "isna", + "isnull", + "MultiIndex", + "NA", + "NamedAgg", + "NaT", + "notna", + "notnull", + "Period", + "PeriodDtype", + "PeriodIndex", + "period_range", + "RangeIndex", + "Series", + "set_eng_float_format", + "StringDtype", + "Timedelta", + "TimedeltaIndex", + "timedelta_range", + "Timestamp", + "to_datetime", + "to_numeric", + "to_timedelta", + "UInt16Dtype", + "UInt32Dtype", + "UInt64Dtype", + "UInt8Dtype", + "unique", + "value_counts", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/apply.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/apply.py new file mode 100644 index 0000000000000000000000000000000000000000..25a71ce5b5f4fe15bc746da43681321bcb9594b8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/apply.py @@ -0,0 +1,2062 @@ +from __future__ import annotations + +import abc +from collections import defaultdict +import functools +from functools import partial +import inspect +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Literal, + cast, +) +import warnings + +import numpy as np + +from pandas._config import option_context + +from pandas._libs import lib +from pandas._libs.internals import BlockValuesRefs +from pandas._typing import ( + AggFuncType, + AggFuncTypeBase, + AggFuncTypeDict, + AggObjType, + Axis, + AxisInt, + NDFrameT, + npt, +) +from pandas.compat._optional import import_optional_dependency +from pandas.errors import SpecificationError +from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.cast import is_nested_object +from pandas.core.dtypes.common import ( + is_dict_like, + is_extension_array_dtype, + is_list_like, + is_numeric_dtype, + is_sequence, +) +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + ExtensionDtype, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCNDFrame, + ABCSeries, +) + +from pandas.core._numba.executor import generate_apply_looper +import pandas.core.common as com +from pandas.core.construction import ensure_wrapped_if_datetimelike + +if TYPE_CHECKING: + from collections.abc import ( + Generator, + Hashable, + Iterable, + MutableMapping, + Sequence, + ) + + from pandas import ( + DataFrame, + Index, + Series, + ) + from pandas.core.groupby import GroupBy + from pandas.core.resample import Resampler + from pandas.core.window.rolling import BaseWindow + + +ResType = dict[int, Any] + + +def frame_apply( + obj: DataFrame, + func: AggFuncType, + axis: Axis = 0, + raw: bool = False, + result_type: str | None = None, + by_row: Literal[False, "compat"] = "compat", + engine: str = "python", + engine_kwargs: dict[str, bool] | None = None, + args=None, + kwargs=None, +) -> FrameApply: + """construct and return a row or column based frame apply object""" + axis = obj._get_axis_number(axis) + klass: type[FrameApply] + if axis == 0: + klass = FrameRowApply + elif axis == 1: + klass = FrameColumnApply + + _, func, _, _ = reconstruct_func(func, **kwargs) + assert func is not None + + return klass( + obj, + func, + raw=raw, + result_type=result_type, + by_row=by_row, + engine=engine, + engine_kwargs=engine_kwargs, + args=args, + kwargs=kwargs, + ) + + +class Apply(metaclass=abc.ABCMeta): + axis: AxisInt + + def __init__( + self, + obj: AggObjType, + func: AggFuncType, + raw: bool, + result_type: str | None, + *, + by_row: Literal[False, "compat", "_compat"] = "compat", + engine: str = "python", + engine_kwargs: dict[str, bool] | None = None, + args, + kwargs, + ) -> None: + self.obj = obj + self.raw = raw + + assert by_row is False or by_row in ["compat", "_compat"] + self.by_row = by_row + + self.args = args or () + self.kwargs = kwargs or {} + + self.engine = engine + self.engine_kwargs = {} if engine_kwargs is None else engine_kwargs + + if result_type not in [None, "reduce", "broadcast", "expand"]: + raise ValueError( + "invalid value for result_type, must be one " + "of {None, 'reduce', 'broadcast', 'expand'}" + ) + + self.result_type = result_type + + self.func = func + + @abc.abstractmethod + def apply(self) -> DataFrame | Series: + pass + + @abc.abstractmethod + def agg_or_apply_list_like( + self, op_name: Literal["agg", "apply"] + ) -> DataFrame | Series: + pass + + @abc.abstractmethod + def agg_or_apply_dict_like( + self, op_name: Literal["agg", "apply"] + ) -> DataFrame | Series: + pass + + def agg(self) -> DataFrame | Series | None: + """ + Provide an implementation for the aggregators. + + Returns + ------- + Result of aggregation, or None if agg cannot be performed by + this method. + """ + obj = self.obj + func = self.func + args = self.args + kwargs = self.kwargs + + if isinstance(func, str): + return self.apply_str() + + if is_dict_like(func): + return self.agg_dict_like() + elif is_list_like(func): + # we require a list, but not a 'str' + return self.agg_list_like() + + if callable(func): + f = com.get_cython_func(func) + if f and not args and not kwargs: + warn_alias_replacement(obj, func, f) + return getattr(obj, f)() + + # caller can react + return None + + def transform(self) -> DataFrame | Series: + """ + Transform a DataFrame or Series. + + Returns + ------- + DataFrame or Series + Result of applying ``func`` along the given axis of the + Series or DataFrame. + + Raises + ------ + ValueError + If the transform function fails or does not transform. + """ + obj = self.obj + func = self.func + axis = self.axis + args = self.args + kwargs = self.kwargs + + is_series = obj.ndim == 1 + + if obj._get_axis_number(axis) == 1: + assert not is_series + return obj.T.transform(func, 0, *args, **kwargs).T + + if is_list_like(func) and not is_dict_like(func): + func = cast(list[AggFuncTypeBase], func) + # Convert func equivalent dict + if is_series: + func = {com.get_callable_name(v) or v: v for v in func} + else: + func = {col: func for col in obj} + + if is_dict_like(func): + func = cast(AggFuncTypeDict, func) + return self.transform_dict_like(func) + + # func is either str or callable + func = cast(AggFuncTypeBase, func) + try: + result = self.transform_str_or_callable(func) + except TypeError: + raise + except Exception as err: + raise ValueError("Transform function failed") from err + + # Functions that transform may return empty Series/DataFrame + # when the dtype is not appropriate + if ( + isinstance(result, (ABCSeries, ABCDataFrame)) + and result.empty + and not obj.empty + ): + raise ValueError("Transform function failed") + # error: Argument 1 to "__get__" of "AxisProperty" has incompatible type + # "Union[Series, DataFrame, GroupBy[Any], SeriesGroupBy, + # DataFrameGroupBy, BaseWindow, Resampler]"; expected "Union[DataFrame, + # Series]" + if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals( + obj.index # type: ignore[arg-type] + ): + raise ValueError("Function did not transform") + + return result + + def transform_dict_like(self, func) -> DataFrame: + """ + Compute transform in the case of a dict-like func + """ + from pandas.core.reshape.concat import concat + + obj = self.obj + args = self.args + kwargs = self.kwargs + + # transform is currently only for Series/DataFrame + assert isinstance(obj, ABCNDFrame) + + if len(func) == 0: + raise ValueError("No transform functions were provided") + + func = self.normalize_dictlike_arg("transform", obj, func) + + results: dict[Hashable, DataFrame | Series] = {} + for name, how in func.items(): + colg = obj._gotitem(name, ndim=1) + results[name] = colg.transform(how, 0, *args, **kwargs) + return concat(results, axis=1) + + def transform_str_or_callable(self, func) -> DataFrame | Series: + """ + Compute transform in the case of a string or callable func + """ + obj = self.obj + args = self.args + kwargs = self.kwargs + + if isinstance(func, str): + return self._apply_str(obj, func, *args, **kwargs) + + if not args and not kwargs: + f = com.get_cython_func(func) + if f: + warn_alias_replacement(obj, func, f) + return getattr(obj, f)() + + # Two possible ways to use a UDF - apply or call directly + try: + return obj.apply(func, args=args, **kwargs) + except Exception: + return func(obj, *args, **kwargs) + + def agg_list_like(self) -> DataFrame | Series: + """ + Compute aggregation in the case of a list-like argument. + + Returns + ------- + Result of aggregation. + """ + return self.agg_or_apply_list_like(op_name="agg") + + def compute_list_like( + self, + op_name: Literal["agg", "apply"], + selected_obj: Series | DataFrame, + kwargs: dict[str, Any], + ) -> tuple[list[Hashable] | Index, list[Any]]: + """ + Compute agg/apply results for like-like input. + + Parameters + ---------- + op_name : {"agg", "apply"} + Operation being performed. + selected_obj : Series or DataFrame + Data to perform operation on. + kwargs : dict + Keyword arguments to pass to the functions. + + Returns + ------- + keys : list[Hashable] or Index + Index labels for result. + results : list + Data for result. When aggregating with a Series, this can contain any + Python objects. + """ + func = cast(list[AggFuncTypeBase], self.func) + obj = self.obj + + results = [] + keys = [] + + # degenerate case + if selected_obj.ndim == 1: + for a in func: + colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj) + args = ( + [self.axis, *self.args] + if include_axis(op_name, colg) + else self.args + ) + new_res = getattr(colg, op_name)(a, *args, **kwargs) + results.append(new_res) + + # make sure we find a good name + name = com.get_callable_name(a) or a + keys.append(name) + + else: + indices = [] + for index, col in enumerate(selected_obj): + colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index]) + args = ( + [self.axis, *self.args] + if include_axis(op_name, colg) + else self.args + ) + new_res = getattr(colg, op_name)(func, *args, **kwargs) + results.append(new_res) + indices.append(index) + # error: Incompatible types in assignment (expression has type "Any | + # Index", variable has type "list[Any | Callable[..., Any] | str]") + keys = selected_obj.columns.take(indices) # type: ignore[assignment] + + return keys, results + + def wrap_results_list_like( + self, keys: Iterable[Hashable], results: list[Series | DataFrame] + ): + from pandas.core.reshape.concat import concat + + obj = self.obj + + try: + return concat(results, keys=keys, axis=1, sort=False) + except TypeError as err: + # we are concatting non-NDFrame objects, + # e.g. a list of scalars + from pandas import Series + + result = Series(results, index=keys, name=obj.name) + if is_nested_object(result): + raise ValueError( + "cannot combine transform and aggregation operations" + ) from err + return result + + def agg_dict_like(self) -> DataFrame | Series: + """ + Compute aggregation in the case of a dict-like argument. + + Returns + ------- + Result of aggregation. + """ + return self.agg_or_apply_dict_like(op_name="agg") + + def compute_dict_like( + self, + op_name: Literal["agg", "apply"], + selected_obj: Series | DataFrame, + selection: Hashable | Sequence[Hashable], + kwargs: dict[str, Any], + ) -> tuple[list[Hashable], list[Any]]: + """ + Compute agg/apply results for dict-like input. + + Parameters + ---------- + op_name : {"agg", "apply"} + Operation being performed. + selected_obj : Series or DataFrame + Data to perform operation on. + selection : hashable or sequence of hashables + Used by GroupBy, Window, and Resample if selection is applied to the object. + kwargs : dict + Keyword arguments to pass to the functions. + + Returns + ------- + keys : list[hashable] + Index labels for result. + results : list + Data for result. When aggregating with a Series, this can contain any + Python object. + """ + from pandas.core.groupby.generic import ( + DataFrameGroupBy, + SeriesGroupBy, + ) + + obj = self.obj + is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy)) + func = cast(AggFuncTypeDict, self.func) + func = self.normalize_dictlike_arg(op_name, selected_obj, func) + + is_non_unique_col = ( + selected_obj.ndim == 2 + and selected_obj.columns.nunique() < len(selected_obj.columns) + ) + + if selected_obj.ndim == 1: + # key only used for output + colg = obj._gotitem(selection, ndim=1) + results = [getattr(colg, op_name)(how, **kwargs) for _, how in func.items()] + keys = list(func.keys()) + elif not is_groupby and is_non_unique_col: + # key used for column selection and output + # GH#51099 + results = [] + keys = [] + for key, how in func.items(): + indices = selected_obj.columns.get_indexer_for([key]) + labels = selected_obj.columns.take(indices) + label_to_indices = defaultdict(list) + for index, label in zip(indices, labels): + label_to_indices[label].append(index) + + key_data = [ + getattr(selected_obj._ixs(indice, axis=1), op_name)(how, **kwargs) + for label, indices in label_to_indices.items() + for indice in indices + ] + + keys += [key] * len(key_data) + results += key_data + else: + # key used for column selection and output + results = [ + getattr(obj._gotitem(key, ndim=1), op_name)(how, **kwargs) + for key, how in func.items() + ] + keys = list(func.keys()) + + return keys, results + + def wrap_results_dict_like( + self, + selected_obj: Series | DataFrame, + result_index: list[Hashable], + result_data: list, + ): + from pandas import Index + from pandas.core.reshape.concat import concat + + obj = self.obj + + # Avoid making two isinstance calls in all and any below + is_ndframe = [isinstance(r, ABCNDFrame) for r in result_data] + + if all(is_ndframe): + results = dict(zip(result_index, result_data)) + keys_to_use: Iterable[Hashable] + keys_to_use = [k for k in result_index if not results[k].empty] + # Have to check, if at least one DataFrame is not empty. + keys_to_use = keys_to_use if keys_to_use != [] else result_index + if selected_obj.ndim == 2: + # keys are columns, so we can preserve names + ktu = Index(keys_to_use) + ktu._set_names(selected_obj.columns.names) + keys_to_use = ktu + + axis: AxisInt = 0 if isinstance(obj, ABCSeries) else 1 + result = concat( + {k: results[k] for k in keys_to_use}, + axis=axis, + keys=keys_to_use, + ) + elif any(is_ndframe): + # There is a mix of NDFrames and scalars + raise ValueError( + "cannot perform both aggregation " + "and transformation operations " + "simultaneously" + ) + else: + from pandas import Series + + # we have a list of scalars + # GH 36212 use name only if obj is a series + if obj.ndim == 1: + obj = cast("Series", obj) + name = obj.name + else: + name = None + + result = Series(result_data, index=result_index, name=name) + + return result + + def apply_str(self) -> DataFrame | Series: + """ + Compute apply in case of a string. + + Returns + ------- + result: Series or DataFrame + """ + # Caller is responsible for checking isinstance(self.f, str) + func = cast(str, self.func) + + obj = self.obj + + from pandas.core.groupby.generic import ( + DataFrameGroupBy, + SeriesGroupBy, + ) + + # Support for `frame.transform('method')` + # Some methods (shift, etc.) require the axis argument, others + # don't, so inspect and insert if necessary. + method = getattr(obj, func, None) + if callable(method): + sig = inspect.getfullargspec(method) + arg_names = (*sig.args, *sig.kwonlyargs) + if self.axis != 0 and ( + "axis" not in arg_names or func in ("corrwith", "skew") + ): + raise ValueError(f"Operation {func} does not support axis=1") + if "axis" in arg_names: + if isinstance(obj, (SeriesGroupBy, DataFrameGroupBy)): + # Try to avoid FutureWarning for deprecated axis keyword; + # If self.axis matches the axis we would get by not passing + # axis, we safely exclude the keyword. + + default_axis = 0 + if func in ["idxmax", "idxmin"]: + # DataFrameGroupBy.idxmax, idxmin axis defaults to self.axis, + # whereas other axis keywords default to 0 + default_axis = self.obj.axis + + if default_axis != self.axis: + self.kwargs["axis"] = self.axis + else: + self.kwargs["axis"] = self.axis + return self._apply_str(obj, func, *self.args, **self.kwargs) + + def apply_list_or_dict_like(self) -> DataFrame | Series: + """ + Compute apply in case of a list-like or dict-like. + + Returns + ------- + result: Series, DataFrame, or None + Result when self.func is a list-like or dict-like, None otherwise. + """ + + if self.engine == "numba": + raise NotImplementedError( + "The 'numba' engine doesn't support list-like/" + "dict likes of callables yet." + ) + + if self.axis == 1 and isinstance(self.obj, ABCDataFrame): + return self.obj.T.apply(self.func, 0, args=self.args, **self.kwargs).T + + func = self.func + kwargs = self.kwargs + + if is_dict_like(func): + result = self.agg_or_apply_dict_like(op_name="apply") + else: + result = self.agg_or_apply_list_like(op_name="apply") + + result = reconstruct_and_relabel_result(result, func, **kwargs) + + return result + + def normalize_dictlike_arg( + self, how: str, obj: DataFrame | Series, func: AggFuncTypeDict + ) -> AggFuncTypeDict: + """ + Handler for dict-like argument. + + Ensures that necessary columns exist if obj is a DataFrame, and + that a nested renamer is not passed. Also normalizes to all lists + when values consists of a mix of list and non-lists. + """ + assert how in ("apply", "agg", "transform") + + # Can't use func.values(); wouldn't work for a Series + if ( + how == "agg" + and isinstance(obj, ABCSeries) + and any(is_list_like(v) for _, v in func.items()) + ) or (any(is_dict_like(v) for _, v in func.items())): + # GH 15931 - deprecation of renaming keys + raise SpecificationError("nested renamer is not supported") + + if obj.ndim != 1: + # Check for missing columns on a frame + from pandas import Index + + cols = Index(list(func.keys())).difference(obj.columns, sort=True) + if len(cols) > 0: + raise KeyError(f"Column(s) {list(cols)} do not exist") + + aggregator_types = (list, tuple, dict) + + # if we have a dict of any non-scalars + # eg. {'A' : ['mean']}, normalize all to + # be list-likes + # Cannot use func.values() because arg may be a Series + if any(isinstance(x, aggregator_types) for _, x in func.items()): + new_func: AggFuncTypeDict = {} + for k, v in func.items(): + if not isinstance(v, aggregator_types): + new_func[k] = [v] + else: + new_func[k] = v + func = new_func + return func + + def _apply_str(self, obj, func: str, *args, **kwargs): + """ + if arg is a string, then try to operate on it: + - try to find a function (or attribute) on obj + - try to find a numpy function + - raise + """ + assert isinstance(func, str) + + if hasattr(obj, func): + f = getattr(obj, func) + if callable(f): + return f(*args, **kwargs) + + # people may aggregate on a non-callable attribute + # but don't let them think they can pass args to it + assert len(args) == 0 + assert len([kwarg for kwarg in kwargs if kwarg not in ["axis"]]) == 0 + return f + elif hasattr(np, func) and hasattr(obj, "__array__"): + # in particular exclude Window + f = getattr(np, func) + return f(obj, *args, **kwargs) + else: + msg = f"'{func}' is not a valid function for '{type(obj).__name__}' object" + raise AttributeError(msg) + + +class NDFrameApply(Apply): + """ + Methods shared by FrameApply and SeriesApply but + not GroupByApply or ResamplerWindowApply + """ + + obj: DataFrame | Series + + @property + def index(self) -> Index: + return self.obj.index + + @property + def agg_axis(self) -> Index: + return self.obj._get_agg_axis(self.axis) + + def agg_or_apply_list_like( + self, op_name: Literal["agg", "apply"] + ) -> DataFrame | Series: + obj = self.obj + kwargs = self.kwargs + + if op_name == "apply": + if isinstance(self, FrameApply): + by_row = self.by_row + + elif isinstance(self, SeriesApply): + by_row = "_compat" if self.by_row else False + else: + by_row = False + kwargs = {**kwargs, "by_row": by_row} + + if getattr(obj, "axis", 0) == 1: + raise NotImplementedError("axis other than 0 is not supported") + + keys, results = self.compute_list_like(op_name, obj, kwargs) + result = self.wrap_results_list_like(keys, results) + return result + + def agg_or_apply_dict_like( + self, op_name: Literal["agg", "apply"] + ) -> DataFrame | Series: + assert op_name in ["agg", "apply"] + obj = self.obj + + kwargs = {} + if op_name == "apply": + by_row = "_compat" if self.by_row else False + kwargs.update({"by_row": by_row}) + + if getattr(obj, "axis", 0) == 1: + raise NotImplementedError("axis other than 0 is not supported") + + selection = None + result_index, result_data = self.compute_dict_like( + op_name, obj, selection, kwargs + ) + result = self.wrap_results_dict_like(obj, result_index, result_data) + return result + + +class FrameApply(NDFrameApply): + obj: DataFrame + + def __init__( + self, + obj: AggObjType, + func: AggFuncType, + raw: bool, + result_type: str | None, + *, + by_row: Literal[False, "compat"] = False, + engine: str = "python", + engine_kwargs: dict[str, bool] | None = None, + args, + kwargs, + ) -> None: + if by_row is not False and by_row != "compat": + raise ValueError(f"by_row={by_row} not allowed") + super().__init__( + obj, + func, + raw, + result_type, + by_row=by_row, + engine=engine, + engine_kwargs=engine_kwargs, + args=args, + kwargs=kwargs, + ) + + # --------------------------------------------------------------- + # Abstract Methods + + @property + @abc.abstractmethod + def result_index(self) -> Index: + pass + + @property + @abc.abstractmethod + def result_columns(self) -> Index: + pass + + @property + @abc.abstractmethod + def series_generator(self) -> Generator[Series, None, None]: + pass + + @staticmethod + @functools.cache + @abc.abstractmethod + def generate_numba_apply_func( + func, nogil=True, nopython=True, parallel=False + ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]: + pass + + @abc.abstractmethod + def apply_with_numba(self): + pass + + def validate_values_for_numba(self): + # Validate column dtyps all OK + for colname, dtype in self.obj.dtypes.items(): + if not is_numeric_dtype(dtype): + raise ValueError( + f"Column {colname} must have a numeric dtype. " + f"Found '{dtype}' instead" + ) + if is_extension_array_dtype(dtype): + raise ValueError( + f"Column {colname} is backed by an extension array, " + f"which is not supported by the numba engine." + ) + + @abc.abstractmethod + def wrap_results_for_axis( + self, results: ResType, res_index: Index + ) -> DataFrame | Series: + pass + + # --------------------------------------------------------------- + + @property + def res_columns(self) -> Index: + return self.result_columns + + @property + def columns(self) -> Index: + return self.obj.columns + + @cache_readonly + def values(self): + return self.obj.values + + def apply(self) -> DataFrame | Series: + """compute the results""" + + # dispatch to handle list-like or dict-like + if is_list_like(self.func): + if self.engine == "numba": + raise NotImplementedError( + "the 'numba' engine doesn't support lists of callables yet" + ) + return self.apply_list_or_dict_like() + + # all empty + if len(self.columns) == 0 and len(self.index) == 0: + return self.apply_empty_result() + + # string dispatch + if isinstance(self.func, str): + if self.engine == "numba": + raise NotImplementedError( + "the 'numba' engine doesn't support using " + "a string as the callable function" + ) + return self.apply_str() + + # ufunc + elif isinstance(self.func, np.ufunc): + if self.engine == "numba": + raise NotImplementedError( + "the 'numba' engine doesn't support " + "using a numpy ufunc as the callable function" + ) + with np.errstate(all="ignore"): + results = self.obj._mgr.apply("apply", func=self.func) + # _constructor will retain self.index and self.columns + return self.obj._constructor_from_mgr(results, axes=results.axes) + + # broadcasting + if self.result_type == "broadcast": + if self.engine == "numba": + raise NotImplementedError( + "the 'numba' engine doesn't support result_type='broadcast'" + ) + return self.apply_broadcast(self.obj) + + # one axis empty + elif not all(self.obj.shape): + return self.apply_empty_result() + + # raw + elif self.raw: + return self.apply_raw(engine=self.engine, engine_kwargs=self.engine_kwargs) + + return self.apply_standard() + + def agg(self): + obj = self.obj + axis = self.axis + + # TODO: Avoid having to change state + self.obj = self.obj if self.axis == 0 else self.obj.T + self.axis = 0 + + result = None + try: + result = super().agg() + finally: + self.obj = obj + self.axis = axis + + if axis == 1: + result = result.T if result is not None else result + + if result is None: + result = self.obj.apply(self.func, axis, args=self.args, **self.kwargs) + + return result + + def apply_empty_result(self): + """ + we have an empty result; at least 1 axis is 0 + + we will try to apply the function to an empty + series in order to see if this is a reduction function + """ + assert callable(self.func) + + # we are not asked to reduce or infer reduction + # so just return a copy of the existing object + if self.result_type not in ["reduce", None]: + return self.obj.copy() + + # we may need to infer + should_reduce = self.result_type == "reduce" + + from pandas import Series + + if not should_reduce: + try: + if self.axis == 0: + r = self.func( + Series([], dtype=np.float64), *self.args, **self.kwargs + ) + else: + r = self.func( + Series(index=self.columns, dtype=np.float64), + *self.args, + **self.kwargs, + ) + except Exception: + pass + else: + should_reduce = not isinstance(r, Series) + + if should_reduce: + if len(self.agg_axis): + r = self.func(Series([], dtype=np.float64), *self.args, **self.kwargs) + else: + r = np.nan + + return self.obj._constructor_sliced(r, index=self.agg_axis) + else: + return self.obj.copy() + + def apply_raw(self, engine="python", engine_kwargs=None): + """apply to the values as a numpy array""" + + def wrap_function(func): + """ + Wrap user supplied function to work around numpy issue. + + see https://github.com/numpy/numpy/issues/8352 + """ + + def wrapper(*args, **kwargs): + result = func(*args, **kwargs) + if isinstance(result, str): + result = np.array(result, dtype=object) + return result + + return wrapper + + if engine == "numba": + engine_kwargs = {} if engine_kwargs is None else engine_kwargs + + # error: Argument 1 to "__call__" of "_lru_cache_wrapper" has + # incompatible type "Callable[..., Any] | str | list[Callable + # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str | + # list[Callable[..., Any] | str]]"; expected "Hashable" + nb_looper = generate_apply_looper( + self.func, **engine_kwargs # type: ignore[arg-type] + ) + result = nb_looper(self.values, self.axis) + # If we made the result 2-D, squeeze it back to 1-D + result = np.squeeze(result) + else: + result = np.apply_along_axis( + wrap_function(self.func), + self.axis, + self.values, + *self.args, + **self.kwargs, + ) + + # TODO: mixed type case + if result.ndim == 2: + return self.obj._constructor(result, index=self.index, columns=self.columns) + else: + return self.obj._constructor_sliced(result, index=self.agg_axis) + + def apply_broadcast(self, target: DataFrame) -> DataFrame: + assert callable(self.func) + + result_values = np.empty_like(target.values) + + # axis which we want to compare compliance + result_compare = target.shape[0] + + for i, col in enumerate(target.columns): + res = self.func(target[col], *self.args, **self.kwargs) + ares = np.asarray(res).ndim + + # must be a scalar or 1d + if ares > 1: + raise ValueError("too many dims to broadcast") + if ares == 1: + # must match return dim + if result_compare != len(res): + raise ValueError("cannot broadcast result") + + result_values[:, i] = res + + # we *always* preserve the original index / columns + result = self.obj._constructor( + result_values, index=target.index, columns=target.columns + ) + return result + + def apply_standard(self): + if self.engine == "python": + results, res_index = self.apply_series_generator() + else: + results, res_index = self.apply_series_numba() + + # wrap results + return self.wrap_results(results, res_index) + + def apply_series_generator(self) -> tuple[ResType, Index]: + assert callable(self.func) + + series_gen = self.series_generator + res_index = self.result_index + + results = {} + + with option_context("mode.chained_assignment", None): + for i, v in enumerate(series_gen): + # ignore SettingWithCopy here in case the user mutates + results[i] = self.func(v, *self.args, **self.kwargs) + if isinstance(results[i], ABCSeries): + # If we have a view on v, we need to make a copy because + # series_generator will swap out the underlying data + results[i] = results[i].copy(deep=False) + + return results, res_index + + def apply_series_numba(self): + if self.engine_kwargs.get("parallel", False): + raise NotImplementedError( + "Parallel apply is not supported when raw=False and engine='numba'" + ) + if not self.obj.index.is_unique or not self.columns.is_unique: + raise NotImplementedError( + "The index/columns must be unique when raw=False and engine='numba'" + ) + self.validate_values_for_numba() + results = self.apply_with_numba() + return results, self.result_index + + def wrap_results(self, results: ResType, res_index: Index) -> DataFrame | Series: + from pandas import Series + + # see if we can infer the results + if len(results) > 0 and 0 in results and is_sequence(results[0]): + return self.wrap_results_for_axis(results, res_index) + + # dict of scalars + + # the default dtype of an empty Series is `object`, but this + # code can be hit by df.mean() where the result should have dtype + # float64 even if it's an empty Series. + constructor_sliced = self.obj._constructor_sliced + if len(results) == 0 and constructor_sliced is Series: + result = constructor_sliced(results, dtype=np.float64) + else: + result = constructor_sliced(results) + result.index = res_index + + return result + + def apply_str(self) -> DataFrame | Series: + # Caller is responsible for checking isinstance(self.func, str) + # TODO: GH#39993 - Avoid special-casing by replacing with lambda + if self.func == "size": + # Special-cased because DataFrame.size returns a single scalar + obj = self.obj + value = obj.shape[self.axis] + return obj._constructor_sliced(value, index=self.agg_axis) + return super().apply_str() + + +class FrameRowApply(FrameApply): + axis: AxisInt = 0 + + @property + def series_generator(self) -> Generator[Series, None, None]: + return (self.obj._ixs(i, axis=1) for i in range(len(self.columns))) + + @staticmethod + @functools.cache + def generate_numba_apply_func( + func, nogil=True, nopython=True, parallel=False + ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]: + numba = import_optional_dependency("numba") + from pandas import Series + + # Import helper from extensions to cast string object -> np strings + # Note: This also has the side effect of loading our numba extensions + from pandas.core._numba.extensions import maybe_cast_str + + jitted_udf = numba.extending.register_jitable(func) + + # Currently the parallel argument doesn't get passed through here + # (it's disabled) since the dicts in numba aren't thread-safe. + @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) + def numba_func(values, col_names, df_index): + results = {} + for j in range(values.shape[1]): + # Create the series + ser = Series( + values[:, j], index=df_index, name=maybe_cast_str(col_names[j]) + ) + results[j] = jitted_udf(ser) + return results + + return numba_func + + def apply_with_numba(self) -> dict[int, Any]: + nb_func = self.generate_numba_apply_func( + cast(Callable, self.func), **self.engine_kwargs + ) + from pandas.core._numba.extensions import set_numba_data + + index = self.obj.index + if index.dtype == "string": + index = index.astype(object) + + columns = self.obj.columns + if columns.dtype == "string": + columns = columns.astype(object) + + # Convert from numba dict to regular dict + # Our isinstance checks in the df constructor don't pass for numbas typed dict + with set_numba_data(index) as index, set_numba_data(columns) as columns: + res = dict(nb_func(self.values, columns, index)) + return res + + @property + def result_index(self) -> Index: + return self.columns + + @property + def result_columns(self) -> Index: + return self.index + + def wrap_results_for_axis( + self, results: ResType, res_index: Index + ) -> DataFrame | Series: + """return the results for the rows""" + + if self.result_type == "reduce": + # e.g. test_apply_dict GH#8735 + res = self.obj._constructor_sliced(results) + res.index = res_index + return res + + elif self.result_type is None and all( + isinstance(x, dict) for x in results.values() + ): + # Our operation was a to_dict op e.g. + # test_apply_dict GH#8735, test_apply_reduce_to_dict GH#25196 #37544 + res = self.obj._constructor_sliced(results) + res.index = res_index + return res + + try: + result = self.obj._constructor(data=results) + except ValueError as err: + if "All arrays must be of the same length" in str(err): + # e.g. result = [[2, 3], [1.5], ['foo', 'bar']] + # see test_agg_listlike_result GH#29587 + res = self.obj._constructor_sliced(results) + res.index = res_index + return res + else: + raise + + if not isinstance(results[0], ABCSeries): + if len(result.index) == len(self.res_columns): + result.index = self.res_columns + + if len(result.columns) == len(res_index): + result.columns = res_index + + return result + + +class FrameColumnApply(FrameApply): + axis: AxisInt = 1 + + def apply_broadcast(self, target: DataFrame) -> DataFrame: + result = super().apply_broadcast(target.T) + return result.T + + @property + def series_generator(self) -> Generator[Series, None, None]: + values = self.values + values = ensure_wrapped_if_datetimelike(values) + assert len(values) > 0 + + # We create one Series object, and will swap out the data inside + # of it. Kids: don't do this at home. + ser = self.obj._ixs(0, axis=0) + mgr = ser._mgr + + is_view = mgr.blocks[0].refs.has_reference() # type: ignore[union-attr] + + if isinstance(ser.dtype, ExtensionDtype): + # values will be incorrect for this block + # TODO(EA2D): special case would be unnecessary with 2D EAs + obj = self.obj + for i in range(len(obj)): + yield obj._ixs(i, axis=0) + + else: + for arr, name in zip(values, self.index): + # GH#35462 re-pin mgr in case setitem changed it + ser._mgr = mgr + mgr.set_values(arr) + object.__setattr__(ser, "_name", name) + if not is_view: + # In apply_series_generator we store the a shallow copy of the + # result, which potentially increases the ref count of this reused + # `ser` object (depending on the result of the applied function) + # -> if that happened and `ser` is already a copy, then we reset + # the refs here to avoid triggering a unnecessary CoW inside the + # applied function (https://github.com/pandas-dev/pandas/pull/56212) + mgr.blocks[0].refs = BlockValuesRefs(mgr.blocks[0]) # type: ignore[union-attr] + yield ser + + @staticmethod + @functools.cache + def generate_numba_apply_func( + func, nogil=True, nopython=True, parallel=False + ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]: + numba = import_optional_dependency("numba") + from pandas import Series + from pandas.core._numba.extensions import maybe_cast_str + + jitted_udf = numba.extending.register_jitable(func) + + @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) + def numba_func(values, col_names_index, index): + results = {} + # Currently the parallel argument doesn't get passed through here + # (it's disabled) since the dicts in numba aren't thread-safe. + for i in range(values.shape[0]): + # Create the series + # TODO: values corrupted without the copy + ser = Series( + values[i].copy(), + index=col_names_index, + name=maybe_cast_str(index[i]), + ) + results[i] = jitted_udf(ser) + + return results + + return numba_func + + def apply_with_numba(self) -> dict[int, Any]: + nb_func = self.generate_numba_apply_func( + cast(Callable, self.func), **self.engine_kwargs + ) + + from pandas.core._numba.extensions import set_numba_data + + # Convert from numba dict to regular dict + # Our isinstance checks in the df constructor don't pass for numbas typed dict + with set_numba_data(self.obj.index) as index, set_numba_data( + self.columns + ) as columns: + res = dict(nb_func(self.values, columns, index)) + + return res + + @property + def result_index(self) -> Index: + return self.index + + @property + def result_columns(self) -> Index: + return self.columns + + def wrap_results_for_axis( + self, results: ResType, res_index: Index + ) -> DataFrame | Series: + """return the results for the columns""" + result: DataFrame | Series + + # we have requested to expand + if self.result_type == "expand": + result = self.infer_to_same_shape(results, res_index) + + # we have a non-series and don't want inference + elif not isinstance(results[0], ABCSeries): + result = self.obj._constructor_sliced(results) + result.index = res_index + + # we may want to infer results + else: + result = self.infer_to_same_shape(results, res_index) + + return result + + def infer_to_same_shape(self, results: ResType, res_index: Index) -> DataFrame: + """infer the results to the same shape as the input object""" + result = self.obj._constructor(data=results) + result = result.T + + # set the index + result.index = res_index + + # infer dtypes + result = result.infer_objects(copy=False) + + return result + + +class SeriesApply(NDFrameApply): + obj: Series + axis: AxisInt = 0 + by_row: Literal[False, "compat", "_compat"] # only relevant for apply() + + def __init__( + self, + obj: Series, + func: AggFuncType, + *, + convert_dtype: bool | lib.NoDefault = lib.no_default, + by_row: Literal[False, "compat", "_compat"] = "compat", + args, + kwargs, + ) -> None: + if convert_dtype is lib.no_default: + convert_dtype = True + else: + warnings.warn( + "the convert_dtype parameter is deprecated and will be removed in a " + "future version. Do ``ser.astype(object).apply()`` " + "instead if you want ``convert_dtype=False``.", + FutureWarning, + stacklevel=find_stack_level(), + ) + self.convert_dtype = convert_dtype + + super().__init__( + obj, + func, + raw=False, + result_type=None, + by_row=by_row, + args=args, + kwargs=kwargs, + ) + + def apply(self) -> DataFrame | Series: + obj = self.obj + + if len(obj) == 0: + return self.apply_empty_result() + + # dispatch to handle list-like or dict-like + if is_list_like(self.func): + return self.apply_list_or_dict_like() + + if isinstance(self.func, str): + # if we are a string, try to dispatch + return self.apply_str() + + if self.by_row == "_compat": + return self.apply_compat() + + # self.func is Callable + return self.apply_standard() + + def agg(self): + result = super().agg() + if result is None: + obj = self.obj + func = self.func + # string, list-like, and dict-like are entirely handled in super + assert callable(func) + + # GH53325: The setup below is just to keep current behavior while emitting a + # deprecation message. In the future this will all be replaced with a simple + # `result = f(self.obj, *self.args, **self.kwargs)`. + try: + result = obj.apply(func, args=self.args, **self.kwargs) + except (ValueError, AttributeError, TypeError): + result = func(obj, *self.args, **self.kwargs) + else: + msg = ( + f"using {func} in {type(obj).__name__}.agg cannot aggregate and " + f"has been deprecated. Use {type(obj).__name__}.transform to " + f"keep behavior unchanged." + ) + warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) + + return result + + def apply_empty_result(self) -> Series: + obj = self.obj + return obj._constructor(dtype=obj.dtype, index=obj.index).__finalize__( + obj, method="apply" + ) + + def apply_compat(self): + """compat apply method for funcs in listlikes and dictlikes. + + Used for each callable when giving listlikes and dictlikes of callables to + apply. Needed for compatibility with Pandas < v2.1. + + .. versionadded:: 2.1.0 + """ + obj = self.obj + func = self.func + + if callable(func): + f = com.get_cython_func(func) + if f and not self.args and not self.kwargs: + return obj.apply(func, by_row=False) + + try: + result = obj.apply(func, by_row="compat") + except (ValueError, AttributeError, TypeError): + result = obj.apply(func, by_row=False) + return result + + def apply_standard(self) -> DataFrame | Series: + # caller is responsible for ensuring that f is Callable + func = cast(Callable, self.func) + obj = self.obj + + if isinstance(func, np.ufunc): + with np.errstate(all="ignore"): + return func(obj, *self.args, **self.kwargs) + elif not self.by_row: + return func(obj, *self.args, **self.kwargs) + + if self.args or self.kwargs: + # _map_values does not support args/kwargs + def curried(x): + return func(x, *self.args, **self.kwargs) + + else: + curried = func + + # row-wise access + # apply doesn't have a `na_action` keyword and for backward compat reasons + # we need to give `na_action="ignore"` for categorical data. + # TODO: remove the `na_action="ignore"` when that default has been changed in + # Categorical (GH51645). + action = "ignore" if isinstance(obj.dtype, CategoricalDtype) else None + mapped = obj._map_values( + mapper=curried, na_action=action, convert=self.convert_dtype + ) + + if len(mapped) and isinstance(mapped[0], ABCSeries): + # GH#43986 Need to do list(mapped) in order to get treated as nested + # See also GH#25959 regarding EA support + return obj._constructor_expanddim(list(mapped), index=obj.index) + else: + return obj._constructor(mapped, index=obj.index).__finalize__( + obj, method="apply" + ) + + +class GroupByApply(Apply): + obj: GroupBy | Resampler | BaseWindow + + def __init__( + self, + obj: GroupBy[NDFrameT], + func: AggFuncType, + *, + args, + kwargs, + ) -> None: + kwargs = kwargs.copy() + self.axis = obj.obj._get_axis_number(kwargs.get("axis", 0)) + super().__init__( + obj, + func, + raw=False, + result_type=None, + args=args, + kwargs=kwargs, + ) + + def apply(self): + raise NotImplementedError + + def transform(self): + raise NotImplementedError + + def agg_or_apply_list_like( + self, op_name: Literal["agg", "apply"] + ) -> DataFrame | Series: + obj = self.obj + kwargs = self.kwargs + if op_name == "apply": + kwargs = {**kwargs, "by_row": False} + + if getattr(obj, "axis", 0) == 1: + raise NotImplementedError("axis other than 0 is not supported") + + if obj._selected_obj.ndim == 1: + # For SeriesGroupBy this matches _obj_with_exclusions + selected_obj = obj._selected_obj + else: + selected_obj = obj._obj_with_exclusions + + # Only set as_index=True on groupby objects, not Window or Resample + # that inherit from this class. + with com.temp_setattr( + obj, "as_index", True, condition=hasattr(obj, "as_index") + ): + keys, results = self.compute_list_like(op_name, selected_obj, kwargs) + result = self.wrap_results_list_like(keys, results) + return result + + def agg_or_apply_dict_like( + self, op_name: Literal["agg", "apply"] + ) -> DataFrame | Series: + from pandas.core.groupby.generic import ( + DataFrameGroupBy, + SeriesGroupBy, + ) + + assert op_name in ["agg", "apply"] + + obj = self.obj + kwargs = {} + if op_name == "apply": + by_row = "_compat" if self.by_row else False + kwargs.update({"by_row": by_row}) + + if getattr(obj, "axis", 0) == 1: + raise NotImplementedError("axis other than 0 is not supported") + + selected_obj = obj._selected_obj + selection = obj._selection + + is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy)) + + # Numba Groupby engine/engine-kwargs passthrough + if is_groupby: + engine = self.kwargs.get("engine", None) + engine_kwargs = self.kwargs.get("engine_kwargs", None) + kwargs.update({"engine": engine, "engine_kwargs": engine_kwargs}) + + with com.temp_setattr( + obj, "as_index", True, condition=hasattr(obj, "as_index") + ): + result_index, result_data = self.compute_dict_like( + op_name, selected_obj, selection, kwargs + ) + result = self.wrap_results_dict_like(selected_obj, result_index, result_data) + return result + + +class ResamplerWindowApply(GroupByApply): + axis: AxisInt = 0 + obj: Resampler | BaseWindow + + def __init__( + self, + obj: Resampler | BaseWindow, + func: AggFuncType, + *, + args, + kwargs, + ) -> None: + super(GroupByApply, self).__init__( + obj, + func, + raw=False, + result_type=None, + args=args, + kwargs=kwargs, + ) + + def apply(self): + raise NotImplementedError + + def transform(self): + raise NotImplementedError + + +def reconstruct_func( + func: AggFuncType | None, **kwargs +) -> tuple[bool, AggFuncType, tuple[str, ...] | None, npt.NDArray[np.intp] | None]: + """ + This is the internal function to reconstruct func given if there is relabeling + or not and also normalize the keyword to get new order of columns. + + If named aggregation is applied, `func` will be None, and kwargs contains the + column and aggregation function information to be parsed; + If named aggregation is not applied, `func` is either string (e.g. 'min') or + Callable, or list of them (e.g. ['min', np.max]), or the dictionary of column name + and str/Callable/list of them (e.g. {'A': 'min'}, or {'A': [np.min, lambda x: x]}) + + If relabeling is True, will return relabeling, reconstructed func, column + names, and the reconstructed order of columns. + If relabeling is False, the columns and order will be None. + + Parameters + ---------- + func: agg function (e.g. 'min' or Callable) or list of agg functions + (e.g. ['min', np.max]) or dictionary (e.g. {'A': ['min', np.max]}). + **kwargs: dict, kwargs used in is_multi_agg_with_relabel and + normalize_keyword_aggregation function for relabelling + + Returns + ------- + relabelling: bool, if there is relabelling or not + func: normalized and mangled func + columns: tuple of column names + order: array of columns indices + + Examples + -------- + >>> reconstruct_func(None, **{"foo": ("col", "min")}) + (True, defaultdict(, {'col': ['min']}), ('foo',), array([0])) + + >>> reconstruct_func("min") + (False, 'min', None, None) + """ + relabeling = func is None and is_multi_agg_with_relabel(**kwargs) + columns: tuple[str, ...] | None = None + order: npt.NDArray[np.intp] | None = None + + if not relabeling: + if isinstance(func, list) and len(func) > len(set(func)): + # GH 28426 will raise error if duplicated function names are used and + # there is no reassigned name + raise SpecificationError( + "Function names must be unique if there is no new column names " + "assigned" + ) + if func is None: + # nicer error message + raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") + + if relabeling: + # error: Incompatible types in assignment (expression has type + # "MutableMapping[Hashable, list[Callable[..., Any] | str]]", variable has type + # "Callable[..., Any] | str | list[Callable[..., Any] | str] | + # MutableMapping[Hashable, Callable[..., Any] | str | list[Callable[..., Any] | + # str]] | None") + func, columns, order = normalize_keyword_aggregation( # type: ignore[assignment] + kwargs + ) + assert func is not None + + return relabeling, func, columns, order + + +def is_multi_agg_with_relabel(**kwargs) -> bool: + """ + Check whether kwargs passed to .agg look like multi-agg with relabeling. + + Parameters + ---------- + **kwargs : dict + + Returns + ------- + bool + + Examples + -------- + >>> is_multi_agg_with_relabel(a="max") + False + >>> is_multi_agg_with_relabel(a_max=("a", "max"), a_min=("a", "min")) + True + >>> is_multi_agg_with_relabel() + False + """ + return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and ( + len(kwargs) > 0 + ) + + +def normalize_keyword_aggregation( + kwargs: dict, +) -> tuple[ + MutableMapping[Hashable, list[AggFuncTypeBase]], + tuple[str, ...], + npt.NDArray[np.intp], +]: + """ + Normalize user-provided "named aggregation" kwargs. + Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs + to the old Dict[str, List[scalar]]]. + + Parameters + ---------- + kwargs : dict + + Returns + ------- + aggspec : dict + The transformed kwargs. + columns : tuple[str, ...] + The user-provided keys. + col_idx_order : List[int] + List of columns indices. + + Examples + -------- + >>> normalize_keyword_aggregation({"output": ("input", "sum")}) + (defaultdict(, {'input': ['sum']}), ('output',), array([0])) + """ + from pandas.core.indexes.base import Index + + # Normalize the aggregation functions as Mapping[column, List[func]], + # process normally, then fixup the names. + # TODO: aggspec type: typing.Dict[str, List[AggScalar]] + aggspec = defaultdict(list) + order = [] + columns, pairs = list(zip(*kwargs.items())) + + for column, aggfunc in pairs: + aggspec[column].append(aggfunc) + order.append((column, com.get_callable_name(aggfunc) or aggfunc)) + + # uniquify aggfunc name if duplicated in order list + uniquified_order = _make_unique_kwarg_list(order) + + # GH 25719, due to aggspec will change the order of assigned columns in aggregation + # uniquified_aggspec will store uniquified order list and will compare it with order + # based on index + aggspec_order = [ + (column, com.get_callable_name(aggfunc) or aggfunc) + for column, aggfuncs in aggspec.items() + for aggfunc in aggfuncs + ] + uniquified_aggspec = _make_unique_kwarg_list(aggspec_order) + + # get the new index of columns by comparison + col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order) + return aggspec, columns, col_idx_order + + +def _make_unique_kwarg_list( + seq: Sequence[tuple[Any, Any]] +) -> Sequence[tuple[Any, Any]]: + """ + Uniquify aggfunc name of the pairs in the order list + + Examples: + -------- + >>> kwarg_list = [('a', ''), ('a', ''), ('b', '')] + >>> _make_unique_kwarg_list(kwarg_list) + [('a', '_0'), ('a', '_1'), ('b', '')] + """ + return [ + (pair[0], f"{pair[1]}_{seq[:i].count(pair)}") if seq.count(pair) > 1 else pair + for i, pair in enumerate(seq) + ] + + +def relabel_result( + result: DataFrame | Series, + func: dict[str, list[Callable | str]], + columns: Iterable[Hashable], + order: Iterable[int], +) -> dict[Hashable, Series]: + """ + Internal function to reorder result if relabelling is True for + dataframe.agg, and return the reordered result in dict. + + Parameters: + ---------- + result: Result from aggregation + func: Dict of (column name, funcs) + columns: New columns name for relabelling + order: New order for relabelling + + Examples + -------- + >>> from pandas.core.apply import relabel_result + >>> result = pd.DataFrame( + ... {"A": [np.nan, 2, np.nan], "C": [6, np.nan, np.nan], "B": [np.nan, 4, 2.5]}, + ... index=["max", "mean", "min"] + ... ) + >>> funcs = {"A": ["max"], "C": ["max"], "B": ["mean", "min"]} + >>> columns = ("foo", "aab", "bar", "dat") + >>> order = [0, 1, 2, 3] + >>> result_in_dict = relabel_result(result, funcs, columns, order) + >>> pd.DataFrame(result_in_dict, index=columns) + A C B + foo 2.0 NaN NaN + aab NaN 6.0 NaN + bar NaN NaN 4.0 + dat NaN NaN 2.5 + """ + from pandas.core.indexes.base import Index + + reordered_indexes = [ + pair[0] for pair in sorted(zip(columns, order), key=lambda t: t[1]) + ] + reordered_result_in_dict: dict[Hashable, Series] = {} + idx = 0 + + reorder_mask = not isinstance(result, ABCSeries) and len(result.columns) > 1 + for col, fun in func.items(): + s = result[col].dropna() + + # In the `_aggregate`, the callable names are obtained and used in `result`, and + # these names are ordered alphabetically. e.g. + # C2 C1 + # 1 NaN + # amax NaN 4.0 + # max NaN 4.0 + # sum 18.0 6.0 + # Therefore, the order of functions for each column could be shuffled + # accordingly so need to get the callable name if it is not parsed names, and + # reorder the aggregated result for each column. + # e.g. if df.agg(c1=("C2", sum), c2=("C2", lambda x: min(x))), correct order is + # [sum, ], but in `result`, it will be [, sum], and we need to + # reorder so that aggregated values map to their functions regarding the order. + + # However there is only one column being used for aggregation, not need to + # reorder since the index is not sorted, and keep as is in `funcs`, e.g. + # A + # min 1.0 + # mean 1.5 + # mean 1.5 + if reorder_mask: + fun = [ + com.get_callable_name(f) if not isinstance(f, str) else f for f in fun + ] + col_idx_order = Index(s.index).get_indexer(fun) + s = s.iloc[col_idx_order] + + # assign the new user-provided "named aggregation" as index names, and reindex + # it based on the whole user-provided names. + s.index = reordered_indexes[idx : idx + len(fun)] + reordered_result_in_dict[col] = s.reindex(columns, copy=False) + idx = idx + len(fun) + return reordered_result_in_dict + + +def reconstruct_and_relabel_result(result, func, **kwargs) -> DataFrame | Series: + from pandas import DataFrame + + relabeling, func, columns, order = reconstruct_func(func, **kwargs) + + if relabeling: + # This is to keep the order to columns occurrence unchanged, and also + # keep the order of new columns occurrence unchanged + + # For the return values of reconstruct_func, if relabeling is + # False, columns and order will be None. + assert columns is not None + assert order is not None + + result_in_dict = relabel_result(result, func, columns, order) + result = DataFrame(result_in_dict, index=columns) + + return result + + +# TODO: Can't use, because mypy doesn't like us setting __name__ +# error: "partial[Any]" has no attribute "__name__" +# the type is: +# typing.Sequence[Callable[..., ScalarResult]] +# -> typing.Sequence[Callable[..., ScalarResult]]: + + +def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]: + """ + Possibly mangle a list of aggfuncs. + + Parameters + ---------- + aggfuncs : Sequence + + Returns + ------- + mangled: list-like + A new AggSpec sequence, where lambdas have been converted + to have unique names. + + Notes + ----- + If just one aggfunc is passed, the name will not be mangled. + """ + if len(aggfuncs) <= 1: + # don't mangle for .agg([lambda x: .]) + return aggfuncs + i = 0 + mangled_aggfuncs = [] + for aggfunc in aggfuncs: + if com.get_callable_name(aggfunc) == "": + aggfunc = partial(aggfunc) + aggfunc.__name__ = f"" + i += 1 + mangled_aggfuncs.append(aggfunc) + + return mangled_aggfuncs + + +def maybe_mangle_lambdas(agg_spec: Any) -> Any: + """ + Make new lambdas with unique names. + + Parameters + ---------- + agg_spec : Any + An argument to GroupBy.agg. + Non-dict-like `agg_spec` are pass through as is. + For dict-like `agg_spec` a new spec is returned + with name-mangled lambdas. + + Returns + ------- + mangled : Any + Same type as the input. + + Examples + -------- + >>> maybe_mangle_lambdas('sum') + 'sum' + >>> maybe_mangle_lambdas([lambda: 1, lambda: 2]) # doctest: +SKIP + [, + .f(*args, **kwargs)>] + """ + is_dict = is_dict_like(agg_spec) + if not (is_dict or is_list_like(agg_spec)): + return agg_spec + mangled_aggspec = type(agg_spec)() # dict or OrderedDict + + if is_dict: + for key, aggfuncs in agg_spec.items(): + if is_list_like(aggfuncs) and not is_dict_like(aggfuncs): + mangled_aggfuncs = _managle_lambda_list(aggfuncs) + else: + mangled_aggfuncs = aggfuncs + + mangled_aggspec[key] = mangled_aggfuncs + else: + mangled_aggspec = _managle_lambda_list(agg_spec) + + return mangled_aggspec + + +def validate_func_kwargs( + kwargs: dict, +) -> tuple[list[str], list[str | Callable[..., Any]]]: + """ + Validates types of user-provided "named aggregation" kwargs. + `TypeError` is raised if aggfunc is not `str` or callable. + + Parameters + ---------- + kwargs : dict + + Returns + ------- + columns : List[str] + List of user-provided keys. + func : List[Union[str, callable[...,Any]]] + List of user-provided aggfuncs + + Examples + -------- + >>> validate_func_kwargs({'one': 'min', 'two': 'max'}) + (['one', 'two'], ['min', 'max']) + """ + tuple_given_message = "func is expected but received {} in **kwargs." + columns = list(kwargs) + func = [] + for col_func in kwargs.values(): + if not (isinstance(col_func, str) or callable(col_func)): + raise TypeError(tuple_given_message.format(type(col_func).__name__)) + func.append(col_func) + if not columns: + no_arg_message = "Must provide 'func' or named aggregation **kwargs." + raise TypeError(no_arg_message) + return columns, func + + +def include_axis(op_name: Literal["agg", "apply"], colg: Series | DataFrame) -> bool: + return isinstance(colg, ABCDataFrame) or ( + isinstance(colg, ABCSeries) and op_name == "agg" + ) + + +def warn_alias_replacement( + obj: AggObjType, + func: Callable, + alias: str, +) -> None: + if alias.startswith("np."): + full_alias = alias + else: + full_alias = f"{type(obj).__name__}.{alias}" + alias = f'"{alias}"' + warnings.warn( + f"The provided callable {func} is currently using " + f"{full_alias}. In a future version of pandas, " + f"the provided callable will be used directly. To keep current " + f"behavior pass the string {alias} instead.", + category=FutureWarning, + stacklevel=find_stack_level(), + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arraylike.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arraylike.py new file mode 100644 index 0000000000000000000000000000000000000000..fd585b3e1946880ca79f97b0b6d4a2990bdd32d1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arraylike.py @@ -0,0 +1,530 @@ +""" +Methods that can be shared by many array-like classes or subclasses: + Series + Index + ExtensionArray +""" +from __future__ import annotations + +import operator +from typing import Any + +import numpy as np + +from pandas._libs import lib +from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op + +from pandas.core.dtypes.generic import ABCNDFrame + +from pandas.core import roperator +from pandas.core.construction import extract_array +from pandas.core.ops.common import unpack_zerodim_and_defer + +REDUCTION_ALIASES = { + "maximum": "max", + "minimum": "min", + "add": "sum", + "multiply": "prod", +} + + +class OpsMixin: + # ------------------------------------------------------------- + # Comparisons + + def _cmp_method(self, other, op): + return NotImplemented + + @unpack_zerodim_and_defer("__eq__") + def __eq__(self, other): + return self._cmp_method(other, operator.eq) + + @unpack_zerodim_and_defer("__ne__") + def __ne__(self, other): + return self._cmp_method(other, operator.ne) + + @unpack_zerodim_and_defer("__lt__") + def __lt__(self, other): + return self._cmp_method(other, operator.lt) + + @unpack_zerodim_and_defer("__le__") + def __le__(self, other): + return self._cmp_method(other, operator.le) + + @unpack_zerodim_and_defer("__gt__") + def __gt__(self, other): + return self._cmp_method(other, operator.gt) + + @unpack_zerodim_and_defer("__ge__") + def __ge__(self, other): + return self._cmp_method(other, operator.ge) + + # ------------------------------------------------------------- + # Logical Methods + + def _logical_method(self, other, op): + return NotImplemented + + @unpack_zerodim_and_defer("__and__") + def __and__(self, other): + return self._logical_method(other, operator.and_) + + @unpack_zerodim_and_defer("__rand__") + def __rand__(self, other): + return self._logical_method(other, roperator.rand_) + + @unpack_zerodim_and_defer("__or__") + def __or__(self, other): + return self._logical_method(other, operator.or_) + + @unpack_zerodim_and_defer("__ror__") + def __ror__(self, other): + return self._logical_method(other, roperator.ror_) + + @unpack_zerodim_and_defer("__xor__") + def __xor__(self, other): + return self._logical_method(other, operator.xor) + + @unpack_zerodim_and_defer("__rxor__") + def __rxor__(self, other): + return self._logical_method(other, roperator.rxor) + + # ------------------------------------------------------------- + # Arithmetic Methods + + def _arith_method(self, other, op): + return NotImplemented + + @unpack_zerodim_and_defer("__add__") + def __add__(self, other): + """ + Get Addition of DataFrame and other, column-wise. + + Equivalent to ``DataFrame.add(other)``. + + Parameters + ---------- + other : scalar, sequence, Series, dict or DataFrame + Object to be added to the DataFrame. + + Returns + ------- + DataFrame + The result of adding ``other`` to DataFrame. + + See Also + -------- + DataFrame.add : Add a DataFrame and another object, with option for index- + or column-oriented addition. + + Examples + -------- + >>> df = pd.DataFrame({'height': [1.5, 2.6], 'weight': [500, 800]}, + ... index=['elk', 'moose']) + >>> df + height weight + elk 1.5 500 + moose 2.6 800 + + Adding a scalar affects all rows and columns. + + >>> df[['height', 'weight']] + 1.5 + height weight + elk 3.0 501.5 + moose 4.1 801.5 + + Each element of a list is added to a column of the DataFrame, in order. + + >>> df[['height', 'weight']] + [0.5, 1.5] + height weight + elk 2.0 501.5 + moose 3.1 801.5 + + Keys of a dictionary are aligned to the DataFrame, based on column names; + each value in the dictionary is added to the corresponding column. + + >>> df[['height', 'weight']] + {'height': 0.5, 'weight': 1.5} + height weight + elk 2.0 501.5 + moose 3.1 801.5 + + When `other` is a :class:`Series`, the index of `other` is aligned with the + columns of the DataFrame. + + >>> s1 = pd.Series([0.5, 1.5], index=['weight', 'height']) + >>> df[['height', 'weight']] + s1 + height weight + elk 3.0 500.5 + moose 4.1 800.5 + + Even when the index of `other` is the same as the index of the DataFrame, + the :class:`Series` will not be reoriented. If index-wise alignment is desired, + :meth:`DataFrame.add` should be used with `axis='index'`. + + >>> s2 = pd.Series([0.5, 1.5], index=['elk', 'moose']) + >>> df[['height', 'weight']] + s2 + elk height moose weight + elk NaN NaN NaN NaN + moose NaN NaN NaN NaN + + >>> df[['height', 'weight']].add(s2, axis='index') + height weight + elk 2.0 500.5 + moose 4.1 801.5 + + When `other` is a :class:`DataFrame`, both columns names and the + index are aligned. + + >>> other = pd.DataFrame({'height': [0.2, 0.4, 0.6]}, + ... index=['elk', 'moose', 'deer']) + >>> df[['height', 'weight']] + other + height weight + deer NaN NaN + elk 1.7 NaN + moose 3.0 NaN + """ + return self._arith_method(other, operator.add) + + @unpack_zerodim_and_defer("__radd__") + def __radd__(self, other): + return self._arith_method(other, roperator.radd) + + @unpack_zerodim_and_defer("__sub__") + def __sub__(self, other): + return self._arith_method(other, operator.sub) + + @unpack_zerodim_and_defer("__rsub__") + def __rsub__(self, other): + return self._arith_method(other, roperator.rsub) + + @unpack_zerodim_and_defer("__mul__") + def __mul__(self, other): + return self._arith_method(other, operator.mul) + + @unpack_zerodim_and_defer("__rmul__") + def __rmul__(self, other): + return self._arith_method(other, roperator.rmul) + + @unpack_zerodim_and_defer("__truediv__") + def __truediv__(self, other): + return self._arith_method(other, operator.truediv) + + @unpack_zerodim_and_defer("__rtruediv__") + def __rtruediv__(self, other): + return self._arith_method(other, roperator.rtruediv) + + @unpack_zerodim_and_defer("__floordiv__") + def __floordiv__(self, other): + return self._arith_method(other, operator.floordiv) + + @unpack_zerodim_and_defer("__rfloordiv") + def __rfloordiv__(self, other): + return self._arith_method(other, roperator.rfloordiv) + + @unpack_zerodim_and_defer("__mod__") + def __mod__(self, other): + return self._arith_method(other, operator.mod) + + @unpack_zerodim_and_defer("__rmod__") + def __rmod__(self, other): + return self._arith_method(other, roperator.rmod) + + @unpack_zerodim_and_defer("__divmod__") + def __divmod__(self, other): + return self._arith_method(other, divmod) + + @unpack_zerodim_and_defer("__rdivmod__") + def __rdivmod__(self, other): + return self._arith_method(other, roperator.rdivmod) + + @unpack_zerodim_and_defer("__pow__") + def __pow__(self, other): + return self._arith_method(other, operator.pow) + + @unpack_zerodim_and_defer("__rpow__") + def __rpow__(self, other): + return self._arith_method(other, roperator.rpow) + + +# ----------------------------------------------------------------------------- +# Helpers to implement __array_ufunc__ + + +def array_ufunc(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any): + """ + Compatibility with numpy ufuncs. + + See also + -------- + numpy.org/doc/stable/reference/arrays.classes.html#numpy.class.__array_ufunc__ + """ + from pandas.core.frame import ( + DataFrame, + Series, + ) + from pandas.core.generic import NDFrame + from pandas.core.internals import ( + ArrayManager, + BlockManager, + ) + + cls = type(self) + + kwargs = _standardize_out_kwarg(**kwargs) + + # for binary ops, use our custom dunder methods + result = maybe_dispatch_ufunc_to_dunder_op(self, ufunc, method, *inputs, **kwargs) + if result is not NotImplemented: + return result + + # Determine if we should defer. + no_defer = ( + np.ndarray.__array_ufunc__, + cls.__array_ufunc__, + ) + + for item in inputs: + higher_priority = ( + hasattr(item, "__array_priority__") + and item.__array_priority__ > self.__array_priority__ + ) + has_array_ufunc = ( + hasattr(item, "__array_ufunc__") + and type(item).__array_ufunc__ not in no_defer + and not isinstance(item, self._HANDLED_TYPES) + ) + if higher_priority or has_array_ufunc: + return NotImplemented + + # align all the inputs. + types = tuple(type(x) for x in inputs) + alignable = [x for x, t in zip(inputs, types) if issubclass(t, NDFrame)] + + if len(alignable) > 1: + # This triggers alignment. + # At the moment, there aren't any ufuncs with more than two inputs + # so this ends up just being x1.index | x2.index, but we write + # it to handle *args. + set_types = set(types) + if len(set_types) > 1 and {DataFrame, Series}.issubset(set_types): + # We currently don't handle ufunc(DataFrame, Series) + # well. Previously this raised an internal ValueError. We might + # support it someday, so raise a NotImplementedError. + raise NotImplementedError( + f"Cannot apply ufunc {ufunc} to mixed DataFrame and Series inputs." + ) + axes = self.axes + for obj in alignable[1:]: + # this relies on the fact that we aren't handling mixed + # series / frame ufuncs. + for i, (ax1, ax2) in enumerate(zip(axes, obj.axes)): + axes[i] = ax1.union(ax2) + + reconstruct_axes = dict(zip(self._AXIS_ORDERS, axes)) + inputs = tuple( + x.reindex(**reconstruct_axes) if issubclass(t, NDFrame) else x + for x, t in zip(inputs, types) + ) + else: + reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes)) + + if self.ndim == 1: + names = [getattr(x, "name") for x in inputs if hasattr(x, "name")] + name = names[0] if len(set(names)) == 1 else None + reconstruct_kwargs = {"name": name} + else: + reconstruct_kwargs = {} + + def reconstruct(result): + if ufunc.nout > 1: + # np.modf, np.frexp, np.divmod + return tuple(_reconstruct(x) for x in result) + + return _reconstruct(result) + + def _reconstruct(result): + if lib.is_scalar(result): + return result + + if result.ndim != self.ndim: + if method == "outer": + raise NotImplementedError + return result + if isinstance(result, (BlockManager, ArrayManager)): + # we went through BlockManager.apply e.g. np.sqrt + result = self._constructor_from_mgr(result, axes=result.axes) + else: + # we converted an array, lost our axes + result = self._constructor( + result, **reconstruct_axes, **reconstruct_kwargs, copy=False + ) + # TODO: When we support multiple values in __finalize__, this + # should pass alignable to `__finalize__` instead of self. + # Then `np.add(a, b)` would consider attrs from both a and b + # when a and b are NDFrames. + if len(alignable) == 1: + result = result.__finalize__(self) + return result + + if "out" in kwargs: + # e.g. test_multiindex_get_loc + result = dispatch_ufunc_with_out(self, ufunc, method, *inputs, **kwargs) + return reconstruct(result) + + if method == "reduce": + # e.g. test.series.test_ufunc.test_reduce + result = dispatch_reduction_ufunc(self, ufunc, method, *inputs, **kwargs) + if result is not NotImplemented: + return result + + # We still get here with kwargs `axis` for e.g. np.maximum.accumulate + # and `dtype` and `keepdims` for np.ptp + + if self.ndim > 1 and (len(inputs) > 1 or ufunc.nout > 1): + # Just give up on preserving types in the complex case. + # In theory we could preserve them for them. + # * nout>1 is doable if BlockManager.apply took nout and + # returned a Tuple[BlockManager]. + # * len(inputs) > 1 is doable when we know that we have + # aligned blocks / dtypes. + + # e.g. my_ufunc, modf, logaddexp, heaviside, subtract, add + inputs = tuple(np.asarray(x) for x in inputs) + # Note: we can't use default_array_ufunc here bc reindexing means + # that `self` may not be among `inputs` + result = getattr(ufunc, method)(*inputs, **kwargs) + elif self.ndim == 1: + # ufunc(series, ...) + inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs) + result = getattr(ufunc, method)(*inputs, **kwargs) + else: + # ufunc(dataframe) + if method == "__call__" and not kwargs: + # for np.(..) calls + # kwargs cannot necessarily be handled block-by-block, so only + # take this path if there are no kwargs + mgr = inputs[0]._mgr + result = mgr.apply(getattr(ufunc, method)) + else: + # otherwise specific ufunc methods (eg np..accumulate(..)) + # Those can have an axis keyword and thus can't be called block-by-block + result = default_array_ufunc(inputs[0], ufunc, method, *inputs, **kwargs) + # e.g. np.negative (only one reached), with "where" and "out" in kwargs + + result = reconstruct(result) + return result + + +def _standardize_out_kwarg(**kwargs) -> dict: + """ + If kwargs contain "out1" and "out2", replace that with a tuple "out" + + np.divmod, np.modf, np.frexp can have either `out=(out1, out2)` or + `out1=out1, out2=out2)` + """ + if "out" not in kwargs and "out1" in kwargs and "out2" in kwargs: + out1 = kwargs.pop("out1") + out2 = kwargs.pop("out2") + out = (out1, out2) + kwargs["out"] = out + return kwargs + + +def dispatch_ufunc_with_out(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): + """ + If we have an `out` keyword, then call the ufunc without `out` and then + set the result into the given `out`. + """ + + # Note: we assume _standardize_out_kwarg has already been called. + out = kwargs.pop("out") + where = kwargs.pop("where", None) + + result = getattr(ufunc, method)(*inputs, **kwargs) + + if result is NotImplemented: + return NotImplemented + + if isinstance(result, tuple): + # i.e. np.divmod, np.modf, np.frexp + if not isinstance(out, tuple) or len(out) != len(result): + raise NotImplementedError + + for arr, res in zip(out, result): + _assign_where(arr, res, where) + + return out + + if isinstance(out, tuple): + if len(out) == 1: + out = out[0] + else: + raise NotImplementedError + + _assign_where(out, result, where) + return out + + +def _assign_where(out, result, where) -> None: + """ + Set a ufunc result into 'out', masking with a 'where' argument if necessary. + """ + if where is None: + # no 'where' arg passed to ufunc + out[:] = result + else: + np.putmask(out, where, result) + + +def default_array_ufunc(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): + """ + Fallback to the behavior we would get if we did not define __array_ufunc__. + + Notes + ----- + We are assuming that `self` is among `inputs`. + """ + if not any(x is self for x in inputs): + raise NotImplementedError + + new_inputs = [x if x is not self else np.asarray(x) for x in inputs] + + return getattr(ufunc, method)(*new_inputs, **kwargs) + + +def dispatch_reduction_ufunc(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): + """ + Dispatch ufunc reductions to self's reduction methods. + """ + assert method == "reduce" + + if len(inputs) != 1 or inputs[0] is not self: + return NotImplemented + + if ufunc.__name__ not in REDUCTION_ALIASES: + return NotImplemented + + method_name = REDUCTION_ALIASES[ufunc.__name__] + + # NB: we are assuming that min/max represent minimum/maximum methods, + # which would not be accurate for e.g. Timestamp.min + if not hasattr(self, method_name): + return NotImplemented + + if self.ndim > 1: + if isinstance(self, ABCNDFrame): + # TODO: test cases where this doesn't hold, i.e. 2D DTA/TDA + kwargs["numeric_only"] = False + + if "axis" not in kwargs: + # For DataFrame reductions we don't want the default axis=0 + # Note: np.min is not a ufunc, but uses array_function_dispatch, + # so calls DataFrame.min (without ever getting here) with the np.min + # default of axis=None, which DataFrame.min catches and changes to axis=0. + # np.minimum.reduce(df) gets here bc axis is not in kwargs, + # so we set axis=0 to match the behaviorof np.minimum.reduce(df.values) + kwargs["axis"] = 0 + + # By default, numpy's reductions do not skip NaNs, so we have to + # pass skipna=False + return getattr(self, method_name)(skipna=False, **kwargs) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/base.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/base.py new file mode 100644 index 0000000000000000000000000000000000000000..e98f1157572bb91d72708e296702d9d4cd34eee8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/base.py @@ -0,0 +1,1391 @@ +""" +Base and utility classes for pandas objects. +""" + +from __future__ import annotations + +import textwrap +from typing import ( + TYPE_CHECKING, + Any, + Generic, + Literal, + cast, + final, + overload, +) +import warnings + +import numpy as np + +from pandas._config import using_copy_on_write + +from pandas._libs import lib +from pandas._typing import ( + AxisInt, + DtypeObj, + IndexLabel, + NDFrameT, + Self, + Shape, + npt, +) +from pandas.compat import PYPY +from pandas.compat.numpy import function as nv +from pandas.errors import AbstractMethodError +from pandas.util._decorators import ( + cache_readonly, + doc, +) +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.cast import can_hold_element +from pandas.core.dtypes.common import ( + is_object_dtype, + is_scalar, +) +from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCIndex, + ABCSeries, +) +from pandas.core.dtypes.missing import ( + isna, + remove_na_arraylike, +) + +from pandas.core import ( + algorithms, + nanops, + ops, +) +from pandas.core.accessor import DirNamesMixin +from pandas.core.arraylike import OpsMixin +from pandas.core.arrays import ExtensionArray +from pandas.core.construction import ( + ensure_wrapped_if_datetimelike, + extract_array, +) + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Iterator, + ) + + from pandas._typing import ( + DropKeep, + NumpySorter, + NumpyValueArrayLike, + ScalarLike_co, + ) + + from pandas import ( + DataFrame, + Index, + Series, + ) + + +_shared_docs: dict[str, str] = {} +_indexops_doc_kwargs = { + "klass": "IndexOpsMixin", + "inplace": "", + "unique": "IndexOpsMixin", + "duplicated": "IndexOpsMixin", +} + + +class PandasObject(DirNamesMixin): + """ + Baseclass for various pandas objects. + """ + + # results from calls to methods decorated with cache_readonly get added to _cache + _cache: dict[str, Any] + + @property + def _constructor(self): + """ + Class constructor (for this class it's just `__class__`). + """ + return type(self) + + def __repr__(self) -> str: + """ + Return a string representation for a particular object. + """ + # Should be overwritten by base classes + return object.__repr__(self) + + def _reset_cache(self, key: str | None = None) -> None: + """ + Reset cached properties. If ``key`` is passed, only clears that key. + """ + if not hasattr(self, "_cache"): + return + if key is None: + self._cache.clear() + else: + self._cache.pop(key, None) + + def __sizeof__(self) -> int: + """ + Generates the total memory usage for an object that returns + either a value or Series of values + """ + memory_usage = getattr(self, "memory_usage", None) + if memory_usage: + mem = memory_usage(deep=True) # pylint: disable=not-callable + return int(mem if is_scalar(mem) else mem.sum()) + + # no memory_usage attribute, so fall back to object's 'sizeof' + return super().__sizeof__() + + +class NoNewAttributesMixin: + """ + Mixin which prevents adding new attributes. + + Prevents additional attributes via xxx.attribute = "something" after a + call to `self.__freeze()`. Mainly used to prevent the user from using + wrong attributes on an accessor (`Series.cat/.str/.dt`). + + If you really want to add a new attribute at a later time, you need to use + `object.__setattr__(self, key, value)`. + """ + + def _freeze(self) -> None: + """ + Prevents setting additional attributes. + """ + object.__setattr__(self, "__frozen", True) + + # prevent adding any attribute via s.xxx.new_attribute = ... + def __setattr__(self, key: str, value) -> None: + # _cache is used by a decorator + # We need to check both 1.) cls.__dict__ and 2.) getattr(self, key) + # because + # 1.) getattr is false for attributes that raise errors + # 2.) cls.__dict__ doesn't traverse into base classes + if getattr(self, "__frozen", False) and not ( + key == "_cache" + or key in type(self).__dict__ + or getattr(self, key, None) is not None + ): + raise AttributeError(f"You cannot add any new attribute '{key}'") + object.__setattr__(self, key, value) + + +class SelectionMixin(Generic[NDFrameT]): + """ + mixin implementing the selection & aggregation interface on a group-like + object sub-classes need to define: obj, exclusions + """ + + obj: NDFrameT + _selection: IndexLabel | None = None + exclusions: frozenset[Hashable] + _internal_names = ["_cache", "__setstate__"] + _internal_names_set = set(_internal_names) + + @final + @property + def _selection_list(self): + if not isinstance( + self._selection, (list, tuple, ABCSeries, ABCIndex, np.ndarray) + ): + return [self._selection] + return self._selection + + @cache_readonly + def _selected_obj(self): + if self._selection is None or isinstance(self.obj, ABCSeries): + return self.obj + else: + return self.obj[self._selection] + + @final + @cache_readonly + def ndim(self) -> int: + return self._selected_obj.ndim + + @final + @cache_readonly + def _obj_with_exclusions(self): + if isinstance(self.obj, ABCSeries): + return self.obj + + if self._selection is not None: + return self.obj._getitem_nocopy(self._selection_list) + + if len(self.exclusions) > 0: + # equivalent to `self.obj.drop(self.exclusions, axis=1) + # but this avoids consolidating and making a copy + # TODO: following GH#45287 can we now use .drop directly without + # making a copy? + return self.obj._drop_axis(self.exclusions, axis=1, only_slice=True) + else: + return self.obj + + def __getitem__(self, key): + if self._selection is not None: + raise IndexError(f"Column(s) {self._selection} already selected") + + if isinstance(key, (list, tuple, ABCSeries, ABCIndex, np.ndarray)): + if len(self.obj.columns.intersection(key)) != len(set(key)): + bad_keys = list(set(key).difference(self.obj.columns)) + raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}") + return self._gotitem(list(key), ndim=2) + + else: + if key not in self.obj: + raise KeyError(f"Column not found: {key}") + ndim = self.obj[key].ndim + return self._gotitem(key, ndim=ndim) + + def _gotitem(self, key, ndim: int, subset=None): + """ + sub-classes to define + return a sliced object + + Parameters + ---------- + key : str / list of selections + ndim : {1, 2} + requested ndim of result + subset : object, default None + subset to act on + """ + raise AbstractMethodError(self) + + @final + def _infer_selection(self, key, subset: Series | DataFrame): + """ + Infer the `selection` to pass to our constructor in _gotitem. + """ + # Shared by Rolling and Resample + selection = None + if subset.ndim == 2 and ( + (lib.is_scalar(key) and key in subset) or lib.is_list_like(key) + ): + selection = key + elif subset.ndim == 1 and lib.is_scalar(key) and key == subset.name: + selection = key + return selection + + def aggregate(self, func, *args, **kwargs): + raise AbstractMethodError(self) + + agg = aggregate + + +class IndexOpsMixin(OpsMixin): + """ + Common ops mixin to support a unified interface / docs for Series / Index + """ + + # ndarray compatibility + __array_priority__ = 1000 + _hidden_attrs: frozenset[str] = frozenset( + ["tolist"] # tolist is not deprecated, just suppressed in the __dir__ + ) + + @property + def dtype(self) -> DtypeObj: + # must be defined here as a property for mypy + raise AbstractMethodError(self) + + @property + def _values(self) -> ExtensionArray | np.ndarray: + # must be defined here as a property for mypy + raise AbstractMethodError(self) + + @final + def transpose(self, *args, **kwargs) -> Self: + """ + Return the transpose, which is by definition self. + + Returns + ------- + %(klass)s + """ + nv.validate_transpose(args, kwargs) + return self + + T = property( + transpose, + doc=""" + Return the transpose, which is by definition self. + + Examples + -------- + For Series: + + >>> s = pd.Series(['Ant', 'Bear', 'Cow']) + >>> s + 0 Ant + 1 Bear + 2 Cow + dtype: object + >>> s.T + 0 Ant + 1 Bear + 2 Cow + dtype: object + + For Index: + + >>> idx = pd.Index([1, 2, 3]) + >>> idx.T + Index([1, 2, 3], dtype='int64') + """, + ) + + @property + def shape(self) -> Shape: + """ + Return a tuple of the shape of the underlying data. + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s.shape + (3,) + """ + return self._values.shape + + def __len__(self) -> int: + # We need this defined here for mypy + raise AbstractMethodError(self) + + @property + def ndim(self) -> Literal[1]: + """ + Number of dimensions of the underlying data, by definition 1. + + Examples + -------- + >>> s = pd.Series(['Ant', 'Bear', 'Cow']) + >>> s + 0 Ant + 1 Bear + 2 Cow + dtype: object + >>> s.ndim + 1 + + For Index: + + >>> idx = pd.Index([1, 2, 3]) + >>> idx + Index([1, 2, 3], dtype='int64') + >>> idx.ndim + 1 + """ + return 1 + + @final + def item(self): + """ + Return the first element of the underlying data as a Python scalar. + + Returns + ------- + scalar + The first element of Series or Index. + + Raises + ------ + ValueError + If the data is not length = 1. + + Examples + -------- + >>> s = pd.Series([1]) + >>> s.item() + 1 + + For an index: + + >>> s = pd.Series([1], index=['a']) + >>> s.index.item() + 'a' + """ + if len(self) == 1: + return next(iter(self)) + raise ValueError("can only convert an array of size 1 to a Python scalar") + + @property + def nbytes(self) -> int: + """ + Return the number of bytes in the underlying data. + + Examples + -------- + For Series: + + >>> s = pd.Series(['Ant', 'Bear', 'Cow']) + >>> s + 0 Ant + 1 Bear + 2 Cow + dtype: object + >>> s.nbytes + 24 + + For Index: + + >>> idx = pd.Index([1, 2, 3]) + >>> idx + Index([1, 2, 3], dtype='int64') + >>> idx.nbytes + 24 + """ + return self._values.nbytes + + @property + def size(self) -> int: + """ + Return the number of elements in the underlying data. + + Examples + -------- + For Series: + + >>> s = pd.Series(['Ant', 'Bear', 'Cow']) + >>> s + 0 Ant + 1 Bear + 2 Cow + dtype: object + >>> s.size + 3 + + For Index: + + >>> idx = pd.Index([1, 2, 3]) + >>> idx + Index([1, 2, 3], dtype='int64') + >>> idx.size + 3 + """ + return len(self._values) + + @property + def array(self) -> ExtensionArray: + """ + The ExtensionArray of the data backing this Series or Index. + + Returns + ------- + ExtensionArray + An ExtensionArray of the values stored within. For extension + types, this is the actual array. For NumPy native types, this + is a thin (no copy) wrapper around :class:`numpy.ndarray`. + + ``.array`` differs from ``.values``, which may require converting + the data to a different form. + + See Also + -------- + Index.to_numpy : Similar method that always returns a NumPy array. + Series.to_numpy : Similar method that always returns a NumPy array. + + Notes + ----- + This table lays out the different array types for each extension + dtype within pandas. + + ================== ============================= + dtype array type + ================== ============================= + category Categorical + period PeriodArray + interval IntervalArray + IntegerNA IntegerArray + string StringArray + boolean BooleanArray + datetime64[ns, tz] DatetimeArray + ================== ============================= + + For any 3rd-party extension types, the array type will be an + ExtensionArray. + + For all remaining dtypes ``.array`` will be a + :class:`arrays.NumpyExtensionArray` wrapping the actual ndarray + stored within. If you absolutely need a NumPy array (possibly with + copying / coercing data), then use :meth:`Series.to_numpy` instead. + + Examples + -------- + For regular NumPy types like int, and float, a NumpyExtensionArray + is returned. + + >>> pd.Series([1, 2, 3]).array + + [1, 2, 3] + Length: 3, dtype: int64 + + For extension types, like Categorical, the actual ExtensionArray + is returned + + >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) + >>> ser.array + ['a', 'b', 'a'] + Categories (2, object): ['a', 'b'] + """ + raise AbstractMethodError(self) + + @final + def to_numpy( + self, + dtype: npt.DTypeLike | None = None, + copy: bool = False, + na_value: object = lib.no_default, + **kwargs, + ) -> np.ndarray: + """ + A NumPy ndarray representing the values in this Series or Index. + + Parameters + ---------- + dtype : str or numpy.dtype, optional + The dtype to pass to :meth:`numpy.asarray`. + copy : bool, default False + Whether to ensure that the returned value is not a view on + another array. Note that ``copy=False`` does not *ensure* that + ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that + a copy is made, even if not strictly necessary. + na_value : Any, optional + The value to use for missing values. The default value depends + on `dtype` and the type of the array. + **kwargs + Additional keywords passed through to the ``to_numpy`` method + of the underlying array (for extension arrays). + + Returns + ------- + numpy.ndarray + + See Also + -------- + Series.array : Get the actual data stored within. + Index.array : Get the actual data stored within. + DataFrame.to_numpy : Similar method for DataFrame. + + Notes + ----- + The returned array will be the same up to equality (values equal + in `self` will be equal in the returned array; likewise for values + that are not equal). When `self` contains an ExtensionArray, the + dtype may be different. For example, for a category-dtype Series, + ``to_numpy()`` will return a NumPy array and the categorical dtype + will be lost. + + For NumPy dtypes, this will be a reference to the actual data stored + in this Series or Index (assuming ``copy=False``). Modifying the result + in place will modify the data stored in the Series or Index (not that + we recommend doing that). + + For extension types, ``to_numpy()`` *may* require copying data and + coercing the result to a NumPy type (possibly object), which may be + expensive. When you need a no-copy reference to the underlying data, + :attr:`Series.array` should be used instead. + + This table lays out the different dtypes and default return types of + ``to_numpy()`` for various dtypes within pandas. + + ================== ================================ + dtype array type + ================== ================================ + category[T] ndarray[T] (same dtype as input) + period ndarray[object] (Periods) + interval ndarray[object] (Intervals) + IntegerNA ndarray[object] + datetime64[ns] datetime64[ns] + datetime64[ns, tz] ndarray[object] (Timestamps) + ================== ================================ + + Examples + -------- + >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) + >>> ser.to_numpy() + array(['a', 'b', 'a'], dtype=object) + + Specify the `dtype` to control how datetime-aware data is represented. + Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp` + objects, each with the correct ``tz``. + + >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) + >>> ser.to_numpy(dtype=object) + array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'), + Timestamp('2000-01-02 00:00:00+0100', tz='CET')], + dtype=object) + + Or ``dtype='datetime64[ns]'`` to return an ndarray of native + datetime64 values. The values are converted to UTC and the timezone + info is dropped. + + >>> ser.to_numpy(dtype="datetime64[ns]") + ... # doctest: +ELLIPSIS + array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'], + dtype='datetime64[ns]') + """ + if isinstance(self.dtype, ExtensionDtype): + return self.array.to_numpy(dtype, copy=copy, na_value=na_value, **kwargs) + elif kwargs: + bad_keys = next(iter(kwargs.keys())) + raise TypeError( + f"to_numpy() got an unexpected keyword argument '{bad_keys}'" + ) + + fillna = ( + na_value is not lib.no_default + # no need to fillna with np.nan if we already have a float dtype + and not (na_value is np.nan and np.issubdtype(self.dtype, np.floating)) + ) + + values = self._values + if fillna: + if not can_hold_element(values, na_value): + # if we can't hold the na_value asarray either makes a copy or we + # error before modifying values. The asarray later on thus won't make + # another copy + values = np.asarray(values, dtype=dtype) + else: + values = values.copy() + + values[np.asanyarray(isna(self))] = na_value + + result = np.asarray(values, dtype=dtype) + + if (copy and not fillna) or (not copy and using_copy_on_write()): + if np.shares_memory(self._values[:2], result[:2]): + # Take slices to improve performance of check + if using_copy_on_write() and not copy: + result = result.view() + result.flags.writeable = False + else: + result = result.copy() + + return result + + @final + @property + def empty(self) -> bool: + return not self.size + + @doc(op="max", oppose="min", value="largest") + def argmax( + self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs + ) -> int: + """ + Return int position of the {value} value in the Series. + + If the {op}imum is achieved in multiple locations, + the first row position is returned. + + Parameters + ---------- + axis : {{None}} + Unused. Parameter needed for compatibility with DataFrame. + skipna : bool, default True + Exclude NA/null values when showing the result. + *args, **kwargs + Additional arguments and keywords for compatibility with NumPy. + + Returns + ------- + int + Row position of the {op}imum value. + + See Also + -------- + Series.arg{op} : Return position of the {op}imum value. + Series.arg{oppose} : Return position of the {oppose}imum value. + numpy.ndarray.arg{op} : Equivalent method for numpy arrays. + Series.idxmax : Return index label of the maximum values. + Series.idxmin : Return index label of the minimum values. + + Examples + -------- + Consider dataset containing cereal calories + + >>> s = pd.Series({{'Corn Flakes': 100.0, 'Almond Delight': 110.0, + ... 'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0}}) + >>> s + Corn Flakes 100.0 + Almond Delight 110.0 + Cinnamon Toast Crunch 120.0 + Cocoa Puff 110.0 + dtype: float64 + + >>> s.argmax() + 2 + >>> s.argmin() + 0 + + The maximum cereal calories is the third element and + the minimum cereal calories is the first element, + since series is zero-indexed. + """ + delegate = self._values + nv.validate_minmax_axis(axis) + skipna = nv.validate_argmax_with_skipna(skipna, args, kwargs) + + if isinstance(delegate, ExtensionArray): + if not skipna and delegate.isna().any(): + warnings.warn( + f"The behavior of {type(self).__name__}.argmax/argmin " + "with skipna=False and NAs, or with all-NAs is deprecated. " + "In a future version this will raise ValueError.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return -1 + else: + return delegate.argmax() + else: + result = nanops.nanargmax(delegate, skipna=skipna) + if result == -1: + warnings.warn( + f"The behavior of {type(self).__name__}.argmax/argmin " + "with skipna=False and NAs, or with all-NAs is deprecated. " + "In a future version this will raise ValueError.", + FutureWarning, + stacklevel=find_stack_level(), + ) + # error: Incompatible return value type (got "Union[int, ndarray]", expected + # "int") + return result # type: ignore[return-value] + + @doc(argmax, op="min", oppose="max", value="smallest") + def argmin( + self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs + ) -> int: + delegate = self._values + nv.validate_minmax_axis(axis) + skipna = nv.validate_argmin_with_skipna(skipna, args, kwargs) + + if isinstance(delegate, ExtensionArray): + if not skipna and delegate.isna().any(): + warnings.warn( + f"The behavior of {type(self).__name__}.argmax/argmin " + "with skipna=False and NAs, or with all-NAs is deprecated. " + "In a future version this will raise ValueError.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return -1 + else: + return delegate.argmin() + else: + result = nanops.nanargmin(delegate, skipna=skipna) + if result == -1: + warnings.warn( + f"The behavior of {type(self).__name__}.argmax/argmin " + "with skipna=False and NAs, or with all-NAs is deprecated. " + "In a future version this will raise ValueError.", + FutureWarning, + stacklevel=find_stack_level(), + ) + # error: Incompatible return value type (got "Union[int, ndarray]", expected + # "int") + return result # type: ignore[return-value] + + def tolist(self): + """ + Return a list of the values. + + These are each a scalar type, which is a Python scalar + (for str, int, float) or a pandas scalar + (for Timestamp/Timedelta/Interval/Period) + + Returns + ------- + list + + See Also + -------- + numpy.ndarray.tolist : Return the array as an a.ndim-levels deep + nested list of Python scalars. + + Examples + -------- + For Series + + >>> s = pd.Series([1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + + For Index: + + >>> idx = pd.Index([1, 2, 3]) + >>> idx + Index([1, 2, 3], dtype='int64') + + >>> idx.to_list() + [1, 2, 3] + """ + return self._values.tolist() + + to_list = tolist + + def __iter__(self) -> Iterator: + """ + Return an iterator of the values. + + These are each a scalar type, which is a Python scalar + (for str, int, float) or a pandas scalar + (for Timestamp/Timedelta/Interval/Period) + + Returns + ------- + iterator + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> for x in s: + ... print(x) + 1 + 2 + 3 + """ + # We are explicitly making element iterators. + if not isinstance(self._values, np.ndarray): + # Check type instead of dtype to catch DTA/TDA + return iter(self._values) + else: + return map(self._values.item, range(self._values.size)) + + @cache_readonly + def hasnans(self) -> bool: + """ + Return True if there are any NaNs. + + Enables various performance speedups. + + Returns + ------- + bool + + Examples + -------- + >>> s = pd.Series([1, 2, 3, None]) + >>> s + 0 1.0 + 1 2.0 + 2 3.0 + 3 NaN + dtype: float64 + >>> s.hasnans + True + """ + # error: Item "bool" of "Union[bool, ndarray[Any, dtype[bool_]], NDFrame]" + # has no attribute "any" + return bool(isna(self).any()) # type: ignore[union-attr] + + @final + def _map_values(self, mapper, na_action=None, convert: bool = True): + """ + An internal function that maps values using the input + correspondence (which can be a dict, Series, or function). + + Parameters + ---------- + mapper : function, dict, or Series + The input correspondence object + na_action : {None, 'ignore'} + If 'ignore', propagate NA values, without passing them to the + mapping function + convert : bool, default True + Try to find better dtype for elementwise function results. If + False, leave as dtype=object. Note that the dtype is always + preserved for some extension array dtypes, such as Categorical. + + Returns + ------- + Union[Index, MultiIndex], inferred + The output of the mapping function applied to the index. + If the function returns a tuple with more than one element + a MultiIndex will be returned. + """ + arr = self._values + + if isinstance(arr, ExtensionArray): + return arr.map(mapper, na_action=na_action) + + return algorithms.map_array(arr, mapper, na_action=na_action, convert=convert) + + @final + def value_counts( + self, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + bins=None, + dropna: bool = True, + ) -> Series: + """ + Return a Series containing counts of unique values. + + The resulting object will be in descending order so that the + first element is the most frequently-occurring element. + Excludes NA values by default. + + Parameters + ---------- + normalize : bool, default False + If True then the object returned will contain the relative + frequencies of the unique values. + sort : bool, default True + Sort by frequencies when True. Preserve the order of the data when False. + ascending : bool, default False + Sort in ascending order. + bins : int, optional + Rather than count values, group them into half-open bins, + a convenience for ``pd.cut``, only works with numeric data. + dropna : bool, default True + Don't include counts of NaN. + + Returns + ------- + Series + + See Also + -------- + Series.count: Number of non-NA elements in a Series. + DataFrame.count: Number of non-NA elements in a DataFrame. + DataFrame.value_counts: Equivalent method on DataFrames. + + Examples + -------- + >>> index = pd.Index([3, 1, 2, 3, 4, np.nan]) + >>> index.value_counts() + 3.0 2 + 1.0 1 + 2.0 1 + 4.0 1 + Name: count, dtype: int64 + + With `normalize` set to `True`, returns the relative frequency by + dividing all values by the sum of values. + + >>> s = pd.Series([3, 1, 2, 3, 4, np.nan]) + >>> s.value_counts(normalize=True) + 3.0 0.4 + 1.0 0.2 + 2.0 0.2 + 4.0 0.2 + Name: proportion, dtype: float64 + + **bins** + + Bins can be useful for going from a continuous variable to a + categorical variable; instead of counting unique + apparitions of values, divide the index in the specified + number of half-open bins. + + >>> s.value_counts(bins=3) + (0.996, 2.0] 2 + (2.0, 3.0] 2 + (3.0, 4.0] 1 + Name: count, dtype: int64 + + **dropna** + + With `dropna` set to `False` we can also see NaN index values. + + >>> s.value_counts(dropna=False) + 3.0 2 + 1.0 1 + 2.0 1 + 4.0 1 + NaN 1 + Name: count, dtype: int64 + """ + return algorithms.value_counts_internal( + self, + sort=sort, + ascending=ascending, + normalize=normalize, + bins=bins, + dropna=dropna, + ) + + def unique(self): + values = self._values + if not isinstance(values, np.ndarray): + # i.e. ExtensionArray + result = values.unique() + else: + result = algorithms.unique1d(values) + return result + + @final + def nunique(self, dropna: bool = True) -> int: + """ + Return number of unique elements in the object. + + Excludes NA values by default. + + Parameters + ---------- + dropna : bool, default True + Don't include NaN in the count. + + Returns + ------- + int + + See Also + -------- + DataFrame.nunique: Method nunique for DataFrame. + Series.count: Count non-NA/null observations in the Series. + + Examples + -------- + >>> s = pd.Series([1, 3, 5, 7, 7]) + >>> s + 0 1 + 1 3 + 2 5 + 3 7 + 4 7 + dtype: int64 + + >>> s.nunique() + 4 + """ + uniqs = self.unique() + if dropna: + uniqs = remove_na_arraylike(uniqs) + return len(uniqs) + + @property + def is_unique(self) -> bool: + """ + Return boolean if values in the object are unique. + + Returns + ------- + bool + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s.is_unique + True + + >>> s = pd.Series([1, 2, 3, 1]) + >>> s.is_unique + False + """ + return self.nunique(dropna=False) == len(self) + + @property + def is_monotonic_increasing(self) -> bool: + """ + Return boolean if values in the object are monotonically increasing. + + Returns + ------- + bool + + Examples + -------- + >>> s = pd.Series([1, 2, 2]) + >>> s.is_monotonic_increasing + True + + >>> s = pd.Series([3, 2, 1]) + >>> s.is_monotonic_increasing + False + """ + from pandas import Index + + return Index(self).is_monotonic_increasing + + @property + def is_monotonic_decreasing(self) -> bool: + """ + Return boolean if values in the object are monotonically decreasing. + + Returns + ------- + bool + + Examples + -------- + >>> s = pd.Series([3, 2, 2, 1]) + >>> s.is_monotonic_decreasing + True + + >>> s = pd.Series([1, 2, 3]) + >>> s.is_monotonic_decreasing + False + """ + from pandas import Index + + return Index(self).is_monotonic_decreasing + + @final + def _memory_usage(self, deep: bool = False) -> int: + """ + Memory usage of the values. + + Parameters + ---------- + deep : bool, default False + Introspect the data deeply, interrogate + `object` dtypes for system-level memory consumption. + + Returns + ------- + bytes used + + See Also + -------- + numpy.ndarray.nbytes : Total bytes consumed by the elements of the + array. + + Notes + ----- + Memory usage does not include memory consumed by elements that + are not components of the array if deep=False or if used on PyPy + + Examples + -------- + >>> idx = pd.Index([1, 2, 3]) + >>> idx.memory_usage() + 24 + """ + if hasattr(self.array, "memory_usage"): + return self.array.memory_usage( # pyright: ignore[reportGeneralTypeIssues] + deep=deep, + ) + + v = self.array.nbytes + if deep and is_object_dtype(self.dtype) and not PYPY: + values = cast(np.ndarray, self._values) + v += lib.memory_usage_of_objects(values) + return v + + @doc( + algorithms.factorize, + values="", + order="", + size_hint="", + sort=textwrap.dedent( + """\ + sort : bool, default False + Sort `uniques` and shuffle `codes` to maintain the + relationship. + """ + ), + ) + def factorize( + self, + sort: bool = False, + use_na_sentinel: bool = True, + ) -> tuple[npt.NDArray[np.intp], Index]: + codes, uniques = algorithms.factorize( + self._values, sort=sort, use_na_sentinel=use_na_sentinel + ) + if uniques.dtype == np.float16: + uniques = uniques.astype(np.float32) + + if isinstance(self, ABCIndex): + # preserve e.g. MultiIndex + uniques = self._constructor(uniques) + else: + from pandas import Index + + uniques = Index(uniques) + return codes, uniques + + _shared_docs[ + "searchsorted" + ] = """ + Find indices where elements should be inserted to maintain order. + + Find the indices into a sorted {klass} `self` such that, if the + corresponding elements in `value` were inserted before the indices, + the order of `self` would be preserved. + + .. note:: + + The {klass} *must* be monotonically sorted, otherwise + wrong locations will likely be returned. Pandas does *not* + check this for you. + + Parameters + ---------- + value : array-like or scalar + Values to insert into `self`. + side : {{'left', 'right'}}, optional + If 'left', the index of the first suitable location found is given. + If 'right', return the last such index. If there is no suitable + index, return either 0 or N (where N is the length of `self`). + sorter : 1-D array-like, optional + Optional array of integer indices that sort `self` into ascending + order. They are typically the result of ``np.argsort``. + + Returns + ------- + int or array of int + A scalar or array of insertion points with the + same shape as `value`. + + See Also + -------- + sort_values : Sort by the values along either axis. + numpy.searchsorted : Similar method from NumPy. + + Notes + ----- + Binary search is used to find the required insertion points. + + Examples + -------- + >>> ser = pd.Series([1, 2, 3]) + >>> ser + 0 1 + 1 2 + 2 3 + dtype: int64 + + >>> ser.searchsorted(4) + 3 + + >>> ser.searchsorted([0, 4]) + array([0, 3]) + + >>> ser.searchsorted([1, 3], side='left') + array([0, 2]) + + >>> ser.searchsorted([1, 3], side='right') + array([1, 3]) + + >>> ser = pd.Series(pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000'])) + >>> ser + 0 2000-03-11 + 1 2000-03-12 + 2 2000-03-13 + dtype: datetime64[ns] + + >>> ser.searchsorted('3/14/2000') + 3 + + >>> ser = pd.Categorical( + ... ['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True + ... ) + >>> ser + ['apple', 'bread', 'bread', 'cheese', 'milk'] + Categories (4, object): ['apple' < 'bread' < 'cheese' < 'milk'] + + >>> ser.searchsorted('bread') + 1 + + >>> ser.searchsorted(['bread'], side='right') + array([3]) + + If the values are not monotonically sorted, wrong locations + may be returned: + + >>> ser = pd.Series([2, 1, 3]) + >>> ser + 0 2 + 1 1 + 2 3 + dtype: int64 + + >>> ser.searchsorted(1) # doctest: +SKIP + 0 # wrong result, correct would be 1 + """ + + # This overload is needed so that the call to searchsorted in + # pandas.core.resample.TimeGrouper._get_period_bins picks the correct result + + # error: Overloaded function signatures 1 and 2 overlap with incompatible + # return types + @overload + def searchsorted( # type: ignore[overload-overlap] + self, + value: ScalarLike_co, + side: Literal["left", "right"] = ..., + sorter: NumpySorter = ..., + ) -> np.intp: + ... + + @overload + def searchsorted( + self, + value: npt.ArrayLike | ExtensionArray, + side: Literal["left", "right"] = ..., + sorter: NumpySorter = ..., + ) -> npt.NDArray[np.intp]: + ... + + @doc(_shared_docs["searchsorted"], klass="Index") + def searchsorted( + self, + value: NumpyValueArrayLike | ExtensionArray, + side: Literal["left", "right"] = "left", + sorter: NumpySorter | None = None, + ) -> npt.NDArray[np.intp] | np.intp: + if isinstance(value, ABCDataFrame): + msg = ( + "Value must be 1-D array-like or scalar, " + f"{type(value).__name__} is not supported" + ) + raise ValueError(msg) + + values = self._values + if not isinstance(values, np.ndarray): + # Going through EA.searchsorted directly improves performance GH#38083 + return values.searchsorted(value, side=side, sorter=sorter) + + return algorithms.searchsorted( + values, + value, + side=side, + sorter=sorter, + ) + + def drop_duplicates(self, *, keep: DropKeep = "first"): + duplicated = self._duplicated(keep=keep) + # error: Value of type "IndexOpsMixin" is not indexable + return self[~duplicated] # type: ignore[index] + + @final + def _duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: + arr = self._values + if isinstance(arr, ExtensionArray): + return arr.duplicated(keep=keep) + return algorithms.duplicated(arr, keep=keep) + + def _arith_method(self, other, op): + res_name = ops.get_op_result_name(self, other) + + lvalues = self._values + rvalues = extract_array(other, extract_numpy=True, extract_range=True) + rvalues = ops.maybe_prepare_scalar_for_op(rvalues, lvalues.shape) + rvalues = ensure_wrapped_if_datetimelike(rvalues) + if isinstance(rvalues, range): + rvalues = np.arange(rvalues.start, rvalues.stop, rvalues.step) + + with np.errstate(all="ignore"): + result = ops.arithmetic_op(lvalues, rvalues, op) + + return self._construct_result(result, name=res_name) + + def _construct_result(self, result, name): + """ + Construct an appropriately-wrapped result from the ArrayLike result + of an arithmetic-like operation. + """ + raise AbstractMethodError(self) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/common.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/common.py new file mode 100644 index 0000000000000000000000000000000000000000..9f024498d66edfe9cab7b6fb2e0802da79163786 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/common.py @@ -0,0 +1,657 @@ +""" +Misc tools for implementing data structures + +Note: pandas.core.common is *not* part of the public API. +""" +from __future__ import annotations + +import builtins +from collections import ( + abc, + defaultdict, +) +from collections.abc import ( + Collection, + Generator, + Hashable, + Iterable, + Sequence, +) +import contextlib +from functools import partial +import inspect +from typing import ( + TYPE_CHECKING, + Any, + Callable, + cast, + overload, +) +import warnings + +import numpy as np + +from pandas._libs import lib +from pandas.compat.numpy import np_version_gte1p24 + +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_integer, +) +from pandas.core.dtypes.generic import ( + ABCExtensionArray, + ABCIndex, + ABCMultiIndex, + ABCSeries, +) +from pandas.core.dtypes.inference import iterable_not_string + +if TYPE_CHECKING: + from pandas._typing import ( + AnyArrayLike, + ArrayLike, + NpDtype, + RandomState, + T, + ) + + from pandas import Index + + +def flatten(line): + """ + Flatten an arbitrarily nested sequence. + + Parameters + ---------- + line : sequence + The non string sequence to flatten + + Notes + ----- + This doesn't consider strings sequences. + + Returns + ------- + flattened : generator + """ + for element in line: + if iterable_not_string(element): + yield from flatten(element) + else: + yield element + + +def consensus_name_attr(objs): + name = objs[0].name + for obj in objs[1:]: + try: + if obj.name != name: + name = None + except ValueError: + name = None + return name + + +def is_bool_indexer(key: Any) -> bool: + """ + Check whether `key` is a valid boolean indexer. + + Parameters + ---------- + key : Any + Only list-likes may be considered boolean indexers. + All other types are not considered a boolean indexer. + For array-like input, boolean ndarrays or ExtensionArrays + with ``_is_boolean`` set are considered boolean indexers. + + Returns + ------- + bool + Whether `key` is a valid boolean indexer. + + Raises + ------ + ValueError + When the array is an object-dtype ndarray or ExtensionArray + and contains missing values. + + See Also + -------- + check_array_indexer : Check that `key` is a valid array to index, + and convert to an ndarray. + """ + if isinstance( + key, (ABCSeries, np.ndarray, ABCIndex, ABCExtensionArray) + ) and not isinstance(key, ABCMultiIndex): + if key.dtype == np.object_: + key_array = np.asarray(key) + + if not lib.is_bool_array(key_array): + na_msg = "Cannot mask with non-boolean array containing NA / NaN values" + if lib.is_bool_array(key_array, skipna=True): + # Don't raise on e.g. ["A", "B", np.nan], see + # test_loc_getitem_list_of_labels_categoricalindex_with_na + raise ValueError(na_msg) + return False + return True + elif is_bool_dtype(key.dtype): + return True + elif isinstance(key, list): + # check if np.array(key).dtype would be bool + if len(key) > 0: + if type(key) is not list: # noqa: E721 + # GH#42461 cython will raise TypeError if we pass a subclass + key = list(key) + return lib.is_bool_list(key) + + return False + + +def cast_scalar_indexer(val): + """ + Disallow indexing with a float key, even if that key is a round number. + + Parameters + ---------- + val : scalar + + Returns + ------- + outval : scalar + """ + # assumes lib.is_scalar(val) + if lib.is_float(val) and val.is_integer(): + raise IndexError( + # GH#34193 + "Indexing with a float is no longer supported. Manually convert " + "to an integer key instead." + ) + return val + + +def not_none(*args): + """ + Returns a generator consisting of the arguments that are not None. + """ + return (arg for arg in args if arg is not None) + + +def any_none(*args) -> bool: + """ + Returns a boolean indicating if any argument is None. + """ + return any(arg is None for arg in args) + + +def all_none(*args) -> bool: + """ + Returns a boolean indicating if all arguments are None. + """ + return all(arg is None for arg in args) + + +def any_not_none(*args) -> bool: + """ + Returns a boolean indicating if any argument is not None. + """ + return any(arg is not None for arg in args) + + +def all_not_none(*args) -> bool: + """ + Returns a boolean indicating if all arguments are not None. + """ + return all(arg is not None for arg in args) + + +def count_not_none(*args) -> int: + """ + Returns the count of arguments that are not None. + """ + return sum(x is not None for x in args) + + +@overload +def asarray_tuplesafe( + values: ArrayLike | list | tuple | zip, dtype: NpDtype | None = ... +) -> np.ndarray: + # ExtensionArray can only be returned when values is an Index, all other iterables + # will return np.ndarray. Unfortunately "all other" cannot be encoded in a type + # signature, so instead we special-case some common types. + ... + + +@overload +def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = ...) -> ArrayLike: + ... + + +def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLike: + if not (isinstance(values, (list, tuple)) or hasattr(values, "__array__")): + values = list(values) + elif isinstance(values, ABCIndex): + return values._values + elif isinstance(values, ABCSeries): + return values._values + + if isinstance(values, list) and dtype in [np.object_, object]: + return construct_1d_object_array_from_listlike(values) + + try: + with warnings.catch_warnings(): + # Can remove warning filter once NumPy 1.24 is min version + if not np_version_gte1p24: + warnings.simplefilter("ignore", np.VisibleDeprecationWarning) + result = np.asarray(values, dtype=dtype) + except ValueError: + # Using try/except since it's more performant than checking is_list_like + # over each element + # error: Argument 1 to "construct_1d_object_array_from_listlike" + # has incompatible type "Iterable[Any]"; expected "Sized" + return construct_1d_object_array_from_listlike(values) # type: ignore[arg-type] + + if issubclass(result.dtype.type, str): + result = np.asarray(values, dtype=object) + + if result.ndim == 2: + # Avoid building an array of arrays: + values = [tuple(x) for x in values] + result = construct_1d_object_array_from_listlike(values) + + return result + + +def index_labels_to_array( + labels: np.ndarray | Iterable, dtype: NpDtype | None = None +) -> np.ndarray: + """ + Transform label or iterable of labels to array, for use in Index. + + Parameters + ---------- + dtype : dtype + If specified, use as dtype of the resulting array, otherwise infer. + + Returns + ------- + array + """ + if isinstance(labels, (str, tuple)): + labels = [labels] + + if not isinstance(labels, (list, np.ndarray)): + try: + labels = list(labels) + except TypeError: # non-iterable + labels = [labels] + + labels = asarray_tuplesafe(labels, dtype=dtype) + + return labels + + +def maybe_make_list(obj): + if obj is not None and not isinstance(obj, (tuple, list)): + return [obj] + return obj + + +def maybe_iterable_to_list(obj: Iterable[T] | T) -> Collection[T] | T: + """ + If obj is Iterable but not list-like, consume into list. + """ + if isinstance(obj, abc.Iterable) and not isinstance(obj, abc.Sized): + return list(obj) + obj = cast(Collection, obj) + return obj + + +def is_null_slice(obj) -> bool: + """ + We have a null slice. + """ + return ( + isinstance(obj, slice) + and obj.start is None + and obj.stop is None + and obj.step is None + ) + + +def is_empty_slice(obj) -> bool: + """ + We have an empty slice, e.g. no values are selected. + """ + return ( + isinstance(obj, slice) + and obj.start is not None + and obj.stop is not None + and obj.start == obj.stop + ) + + +def is_true_slices(line) -> list[bool]: + """ + Find non-trivial slices in "line": return a list of booleans with same length. + """ + return [isinstance(k, slice) and not is_null_slice(k) for k in line] + + +# TODO: used only once in indexing; belongs elsewhere? +def is_full_slice(obj, line: int) -> bool: + """ + We have a full length slice. + """ + return ( + isinstance(obj, slice) + and obj.start == 0 + and obj.stop == line + and obj.step is None + ) + + +def get_callable_name(obj): + # typical case has name + if hasattr(obj, "__name__"): + return getattr(obj, "__name__") + # some objects don't; could recurse + if isinstance(obj, partial): + return get_callable_name(obj.func) + # fall back to class name + if callable(obj): + return type(obj).__name__ + # everything failed (probably because the argument + # wasn't actually callable); we return None + # instead of the empty string in this case to allow + # distinguishing between no name and a name of '' + return None + + +def apply_if_callable(maybe_callable, obj, **kwargs): + """ + Evaluate possibly callable input using obj and kwargs if it is callable, + otherwise return as it is. + + Parameters + ---------- + maybe_callable : possibly a callable + obj : NDFrame + **kwargs + """ + if callable(maybe_callable): + return maybe_callable(obj, **kwargs) + + return maybe_callable + + +def standardize_mapping(into): + """ + Helper function to standardize a supplied mapping. + + Parameters + ---------- + into : instance or subclass of collections.abc.Mapping + Must be a class, an initialized collections.defaultdict, + or an instance of a collections.abc.Mapping subclass. + + Returns + ------- + mapping : a collections.abc.Mapping subclass or other constructor + a callable object that can accept an iterator to create + the desired Mapping. + + See Also + -------- + DataFrame.to_dict + Series.to_dict + """ + if not inspect.isclass(into): + if isinstance(into, defaultdict): + return partial(defaultdict, into.default_factory) + into = type(into) + if not issubclass(into, abc.Mapping): + raise TypeError(f"unsupported type: {into}") + if into == defaultdict: + raise TypeError("to_dict() only accepts initialized defaultdicts") + return into + + +@overload +def random_state(state: np.random.Generator) -> np.random.Generator: + ... + + +@overload +def random_state( + state: int | np.ndarray | np.random.BitGenerator | np.random.RandomState | None, +) -> np.random.RandomState: + ... + + +def random_state(state: RandomState | None = None): + """ + Helper function for processing random_state arguments. + + Parameters + ---------- + state : int, array-like, BitGenerator, Generator, np.random.RandomState, None. + If receives an int, array-like, or BitGenerator, passes to + np.random.RandomState() as seed. + If receives an np.random RandomState or Generator, just returns that unchanged. + If receives `None`, returns np.random. + If receives anything else, raises an informative ValueError. + + Default None. + + Returns + ------- + np.random.RandomState or np.random.Generator. If state is None, returns np.random + + """ + if is_integer(state) or isinstance(state, (np.ndarray, np.random.BitGenerator)): + return np.random.RandomState(state) + elif isinstance(state, np.random.RandomState): + return state + elif isinstance(state, np.random.Generator): + return state + elif state is None: + return np.random + else: + raise ValueError( + "random_state must be an integer, array-like, a BitGenerator, Generator, " + "a numpy RandomState, or None" + ) + + +def pipe( + obj, func: Callable[..., T] | tuple[Callable[..., T], str], *args, **kwargs +) -> T: + """ + Apply a function ``func`` to object ``obj`` either by passing obj as the + first argument to the function or, in the case that the func is a tuple, + interpret the first element of the tuple as a function and pass the obj to + that function as a keyword argument whose key is the value of the second + element of the tuple. + + Parameters + ---------- + func : callable or tuple of (callable, str) + Function to apply to this object or, alternatively, a + ``(callable, data_keyword)`` tuple where ``data_keyword`` is a + string indicating the keyword of ``callable`` that expects the + object. + *args : iterable, optional + Positional arguments passed into ``func``. + **kwargs : dict, optional + A dictionary of keyword arguments passed into ``func``. + + Returns + ------- + object : the return type of ``func``. + """ + if isinstance(func, tuple): + func, target = func + if target in kwargs: + msg = f"{target} is both the pipe target and a keyword argument" + raise ValueError(msg) + kwargs[target] = obj + return func(*args, **kwargs) + else: + return func(obj, *args, **kwargs) + + +def get_rename_function(mapper): + """ + Returns a function that will map names/labels, dependent if mapper + is a dict, Series or just a function. + """ + + def f(x): + if x in mapper: + return mapper[x] + else: + return x + + return f if isinstance(mapper, (abc.Mapping, ABCSeries)) else mapper + + +def convert_to_list_like( + values: Hashable | Iterable | AnyArrayLike, +) -> list | AnyArrayLike: + """ + Convert list-like or scalar input to list-like. List, numpy and pandas array-like + inputs are returned unmodified whereas others are converted to list. + """ + if isinstance(values, (list, np.ndarray, ABCIndex, ABCSeries, ABCExtensionArray)): + return values + elif isinstance(values, abc.Iterable) and not isinstance(values, str): + return list(values) + + return [values] + + +@contextlib.contextmanager +def temp_setattr( + obj, attr: str, value, condition: bool = True +) -> Generator[None, None, None]: + """ + Temporarily set attribute on an object. + + Parameters + ---------- + obj : object + Object whose attribute will be modified. + attr : str + Attribute to modify. + value : Any + Value to temporarily set attribute to. + condition : bool, default True + Whether to set the attribute. Provided in order to not have to + conditionally use this context manager. + + Yields + ------ + object : obj with modified attribute. + """ + if condition: + old_value = getattr(obj, attr) + setattr(obj, attr, value) + try: + yield obj + finally: + if condition: + setattr(obj, attr, old_value) + + +def require_length_match(data, index: Index) -> None: + """ + Check the length of data matches the length of the index. + """ + if len(data) != len(index): + raise ValueError( + "Length of values " + f"({len(data)}) " + "does not match length of index " + f"({len(index)})" + ) + + +# the ufuncs np.maximum.reduce and np.minimum.reduce default to axis=0, +# whereas np.min and np.max (which directly call obj.min and obj.max) +# default to axis=None. +_builtin_table = { + builtins.sum: np.sum, + builtins.max: np.maximum.reduce, + builtins.min: np.minimum.reduce, +} + +# GH#53425: Only for deprecation +_builtin_table_alias = { + builtins.sum: "np.sum", + builtins.max: "np.maximum.reduce", + builtins.min: "np.minimum.reduce", +} + +_cython_table = { + builtins.sum: "sum", + builtins.max: "max", + builtins.min: "min", + np.all: "all", + np.any: "any", + np.sum: "sum", + np.nansum: "sum", + np.mean: "mean", + np.nanmean: "mean", + np.prod: "prod", + np.nanprod: "prod", + np.std: "std", + np.nanstd: "std", + np.var: "var", + np.nanvar: "var", + np.median: "median", + np.nanmedian: "median", + np.max: "max", + np.nanmax: "max", + np.min: "min", + np.nanmin: "min", + np.cumprod: "cumprod", + np.nancumprod: "cumprod", + np.cumsum: "cumsum", + np.nancumsum: "cumsum", +} + + +def get_cython_func(arg: Callable) -> str | None: + """ + if we define an internal function for this argument, return it + """ + return _cython_table.get(arg) + + +def is_builtin_func(arg): + """ + if we define a builtin function for this argument, return it, + otherwise return the arg + """ + return _builtin_table.get(arg, arg) + + +def fill_missing_names(names: Sequence[Hashable | None]) -> list[Hashable]: + """ + If a name is missing then replace it by level_n, where n is the count + + .. versionadded:: 1.4.0 + + Parameters + ---------- + names : list-like + list of column names or None values. + + Returns + ------- + list + list of column names with the None values replaced. + """ + return [f"level_{i}" if name is None else name for i, name in enumerate(names)] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/config_init.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/config_init.py new file mode 100644 index 0000000000000000000000000000000000000000..a8b63f97141c22907726b477e1197ffae2a57853 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/config_init.py @@ -0,0 +1,924 @@ +""" +This module is imported from the pandas package __init__.py file +in order to ensure that the core.config options registered here will +be available as soon as the user loads the package. if register_option +is invoked inside specific modules, they will not be registered until that +module is imported, which may or may not be a problem. + +If you need to make sure options are available even before a certain +module is imported, register them here rather than in the module. + +""" +from __future__ import annotations + +import os +from typing import Callable + +import pandas._config.config as cf +from pandas._config.config import ( + is_bool, + is_callable, + is_instance_factory, + is_int, + is_nonnegative_int, + is_one_of_factory, + is_str, + is_text, +) + +# compute + +use_bottleneck_doc = """ +: bool + Use the bottleneck library to accelerate if it is installed, + the default is True + Valid values: False,True +""" + + +def use_bottleneck_cb(key) -> None: + from pandas.core import nanops + + nanops.set_use_bottleneck(cf.get_option(key)) + + +use_numexpr_doc = """ +: bool + Use the numexpr library to accelerate computation if it is installed, + the default is True + Valid values: False,True +""" + + +def use_numexpr_cb(key) -> None: + from pandas.core.computation import expressions + + expressions.set_use_numexpr(cf.get_option(key)) + + +use_numba_doc = """ +: bool + Use the numba engine option for select operations if it is installed, + the default is False + Valid values: False,True +""" + + +def use_numba_cb(key) -> None: + from pandas.core.util import numba_ + + numba_.set_use_numba(cf.get_option(key)) + + +with cf.config_prefix("compute"): + cf.register_option( + "use_bottleneck", + True, + use_bottleneck_doc, + validator=is_bool, + cb=use_bottleneck_cb, + ) + cf.register_option( + "use_numexpr", True, use_numexpr_doc, validator=is_bool, cb=use_numexpr_cb + ) + cf.register_option( + "use_numba", False, use_numba_doc, validator=is_bool, cb=use_numba_cb + ) +# +# options from the "display" namespace + +pc_precision_doc = """ +: int + Floating point output precision in terms of number of places after the + decimal, for regular formatting as well as scientific notation. Similar + to ``precision`` in :meth:`numpy.set_printoptions`. +""" + +pc_colspace_doc = """ +: int + Default space for DataFrame columns. +""" + +pc_max_rows_doc = """ +: int + If max_rows is exceeded, switch to truncate view. Depending on + `large_repr`, objects are either centrally truncated or printed as + a summary view. 'None' value means unlimited. + + In case python/IPython is running in a terminal and `large_repr` + equals 'truncate' this can be set to 0 and pandas will auto-detect + the height of the terminal and print a truncated object which fits + the screen height. The IPython notebook, IPython qtconsole, or + IDLE do not run in a terminal and hence it is not possible to do + correct auto-detection. +""" + +pc_min_rows_doc = """ +: int + The numbers of rows to show in a truncated view (when `max_rows` is + exceeded). Ignored when `max_rows` is set to None or 0. When set to + None, follows the value of `max_rows`. +""" + +pc_max_cols_doc = """ +: int + If max_cols is exceeded, switch to truncate view. Depending on + `large_repr`, objects are either centrally truncated or printed as + a summary view. 'None' value means unlimited. + + In case python/IPython is running in a terminal and `large_repr` + equals 'truncate' this can be set to 0 or None and pandas will auto-detect + the width of the terminal and print a truncated object which fits + the screen width. The IPython notebook, IPython qtconsole, or IDLE + do not run in a terminal and hence it is not possible to do + correct auto-detection and defaults to 20. +""" + +pc_max_categories_doc = """ +: int + This sets the maximum number of categories pandas should output when + printing out a `Categorical` or a Series of dtype "category". +""" + +pc_max_info_cols_doc = """ +: int + max_info_columns is used in DataFrame.info method to decide if + per column information will be printed. +""" + +pc_nb_repr_h_doc = """ +: boolean + When True, IPython notebook will use html representation for + pandas objects (if it is available). +""" + +pc_pprint_nest_depth = """ +: int + Controls the number of nested levels to process when pretty-printing +""" + +pc_multi_sparse_doc = """ +: boolean + "sparsify" MultiIndex display (don't display repeated + elements in outer levels within groups) +""" + +float_format_doc = """ +: callable + The callable should accept a floating point number and return + a string with the desired format of the number. This is used + in some places like SeriesFormatter. + See formats.format.EngFormatter for an example. +""" + +max_colwidth_doc = """ +: int or None + The maximum width in characters of a column in the repr of + a pandas data structure. When the column overflows, a "..." + placeholder is embedded in the output. A 'None' value means unlimited. +""" + +colheader_justify_doc = """ +: 'left'/'right' + Controls the justification of column headers. used by DataFrameFormatter. +""" + +pc_expand_repr_doc = """ +: boolean + Whether to print out the full DataFrame repr for wide DataFrames across + multiple lines, `max_columns` is still respected, but the output will + wrap-around across multiple "pages" if its width exceeds `display.width`. +""" + +pc_show_dimensions_doc = """ +: boolean or 'truncate' + Whether to print out dimensions at the end of DataFrame repr. + If 'truncate' is specified, only print out the dimensions if the + frame is truncated (e.g. not display all rows and/or columns) +""" + +pc_east_asian_width_doc = """ +: boolean + Whether to use the Unicode East Asian Width to calculate the display text + width. + Enabling this may affect to the performance (default: False) +""" + +pc_ambiguous_as_wide_doc = """ +: boolean + Whether to handle Unicode characters belong to Ambiguous as Wide (width=2) + (default: False) +""" + +pc_table_schema_doc = """ +: boolean + Whether to publish a Table Schema representation for frontends + that support it. + (default: False) +""" + +pc_html_border_doc = """ +: int + A ``border=value`` attribute is inserted in the ```` tag + for the DataFrame HTML repr. +""" + +pc_html_use_mathjax_doc = """\ +: boolean + When True, Jupyter notebook will process table contents using MathJax, + rendering mathematical expressions enclosed by the dollar symbol. + (default: True) +""" + +pc_max_dir_items = """\ +: int + The number of items that will be added to `dir(...)`. 'None' value means + unlimited. Because dir is cached, changing this option will not immediately + affect already existing dataframes until a column is deleted or added. + + This is for instance used to suggest columns from a dataframe to tab + completion. +""" + +pc_width_doc = """ +: int + Width of the display in characters. In case python/IPython is running in + a terminal this can be set to None and pandas will correctly auto-detect + the width. + Note that the IPython notebook, IPython qtconsole, or IDLE do not run in a + terminal and hence it is not possible to correctly detect the width. +""" + +pc_chop_threshold_doc = """ +: float or None + if set to a float value, all float values smaller than the given threshold + will be displayed as exactly 0 by repr and friends. +""" + +pc_max_seq_items = """ +: int or None + When pretty-printing a long sequence, no more then `max_seq_items` + will be printed. If items are omitted, they will be denoted by the + addition of "..." to the resulting string. + + If set to None, the number of items to be printed is unlimited. +""" + +pc_max_info_rows_doc = """ +: int + df.info() will usually show null-counts for each column. + For large frames this can be quite slow. max_info_rows and max_info_cols + limit this null check only to frames with smaller dimensions than + specified. +""" + +pc_large_repr_doc = """ +: 'truncate'/'info' + For DataFrames exceeding max_rows/max_cols, the repr (and HTML repr) can + show a truncated table, or switch to the view from + df.info() (the behaviour in earlier versions of pandas). +""" + +pc_memory_usage_doc = """ +: bool, string or None + This specifies if the memory usage of a DataFrame should be displayed when + df.info() is called. Valid values True,False,'deep' +""" + + +def table_schema_cb(key) -> None: + from pandas.io.formats.printing import enable_data_resource_formatter + + enable_data_resource_formatter(cf.get_option(key)) + + +def is_terminal() -> bool: + """ + Detect if Python is running in a terminal. + + Returns True if Python is running in a terminal or False if not. + """ + try: + # error: Name 'get_ipython' is not defined + ip = get_ipython() # type: ignore[name-defined] + except NameError: # assume standard Python interpreter in a terminal + return True + else: + if hasattr(ip, "kernel"): # IPython as a Jupyter kernel + return False + else: # IPython in a terminal + return True + + +with cf.config_prefix("display"): + cf.register_option("precision", 6, pc_precision_doc, validator=is_nonnegative_int) + cf.register_option( + "float_format", + None, + float_format_doc, + validator=is_one_of_factory([None, is_callable]), + ) + cf.register_option( + "max_info_rows", + 1690785, + pc_max_info_rows_doc, + validator=is_int, + ) + cf.register_option("max_rows", 60, pc_max_rows_doc, validator=is_nonnegative_int) + cf.register_option( + "min_rows", + 10, + pc_min_rows_doc, + validator=is_instance_factory([type(None), int]), + ) + cf.register_option("max_categories", 8, pc_max_categories_doc, validator=is_int) + + cf.register_option( + "max_colwidth", + 50, + max_colwidth_doc, + validator=is_nonnegative_int, + ) + if is_terminal(): + max_cols = 0 # automatically determine optimal number of columns + else: + max_cols = 20 # cannot determine optimal number of columns + cf.register_option( + "max_columns", max_cols, pc_max_cols_doc, validator=is_nonnegative_int + ) + cf.register_option( + "large_repr", + "truncate", + pc_large_repr_doc, + validator=is_one_of_factory(["truncate", "info"]), + ) + cf.register_option("max_info_columns", 100, pc_max_info_cols_doc, validator=is_int) + cf.register_option( + "colheader_justify", "right", colheader_justify_doc, validator=is_text + ) + cf.register_option("notebook_repr_html", True, pc_nb_repr_h_doc, validator=is_bool) + cf.register_option("pprint_nest_depth", 3, pc_pprint_nest_depth, validator=is_int) + cf.register_option("multi_sparse", True, pc_multi_sparse_doc, validator=is_bool) + cf.register_option("expand_frame_repr", True, pc_expand_repr_doc) + cf.register_option( + "show_dimensions", + "truncate", + pc_show_dimensions_doc, + validator=is_one_of_factory([True, False, "truncate"]), + ) + cf.register_option("chop_threshold", None, pc_chop_threshold_doc) + cf.register_option("max_seq_items", 100, pc_max_seq_items) + cf.register_option( + "width", 80, pc_width_doc, validator=is_instance_factory([type(None), int]) + ) + cf.register_option( + "memory_usage", + True, + pc_memory_usage_doc, + validator=is_one_of_factory([None, True, False, "deep"]), + ) + cf.register_option( + "unicode.east_asian_width", False, pc_east_asian_width_doc, validator=is_bool + ) + cf.register_option( + "unicode.ambiguous_as_wide", False, pc_east_asian_width_doc, validator=is_bool + ) + cf.register_option( + "html.table_schema", + False, + pc_table_schema_doc, + validator=is_bool, + cb=table_schema_cb, + ) + cf.register_option("html.border", 1, pc_html_border_doc, validator=is_int) + cf.register_option( + "html.use_mathjax", True, pc_html_use_mathjax_doc, validator=is_bool + ) + cf.register_option( + "max_dir_items", 100, pc_max_dir_items, validator=is_nonnegative_int + ) + +tc_sim_interactive_doc = """ +: boolean + Whether to simulate interactive mode for purposes of testing +""" + +with cf.config_prefix("mode"): + cf.register_option("sim_interactive", False, tc_sim_interactive_doc) + +use_inf_as_na_doc = """ +: boolean + True means treat None, NaN, INF, -INF as NA (old way), + False means None and NaN are null, but INF, -INF are not NA + (new way). + + This option is deprecated in pandas 2.1.0 and will be removed in 3.0. +""" + +# We don't want to start importing everything at the global context level +# or we'll hit circular deps. + + +def use_inf_as_na_cb(key) -> None: + # TODO(3.0): enforcing this deprecation will close GH#52501 + from pandas.core.dtypes.missing import _use_inf_as_na + + _use_inf_as_na(key) + + +with cf.config_prefix("mode"): + cf.register_option("use_inf_as_na", False, use_inf_as_na_doc, cb=use_inf_as_na_cb) + +cf.deprecate_option( + # GH#51684 + "mode.use_inf_as_na", + "use_inf_as_na option is deprecated and will be removed in a future " + "version. Convert inf values to NaN before operating instead.", +) + +data_manager_doc = """ +: string + Internal data manager type; can be "block" or "array". Defaults to "block", + unless overridden by the 'PANDAS_DATA_MANAGER' environment variable (needs + to be set before pandas is imported). +""" + + +with cf.config_prefix("mode"): + cf.register_option( + "data_manager", + # Get the default from an environment variable, if set, otherwise defaults + # to "block". This environment variable can be set for testing. + os.environ.get("PANDAS_DATA_MANAGER", "block"), + data_manager_doc, + validator=is_one_of_factory(["block", "array"]), + ) + +cf.deprecate_option( + # GH#55043 + "mode.data_manager", + "data_manager option is deprecated and will be removed in a future " + "version. Only the BlockManager will be available.", +) + + +# TODO better name? +copy_on_write_doc = """ +: bool + Use new copy-view behaviour using Copy-on-Write. Defaults to False, + unless overridden by the 'PANDAS_COPY_ON_WRITE' environment variable + (if set to "1" for True, needs to be set before pandas is imported). +""" + + +with cf.config_prefix("mode"): + cf.register_option( + "copy_on_write", + # Get the default from an environment variable, if set, otherwise defaults + # to False. This environment variable can be set for testing. + "warn" + if os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "warn" + else os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "1", + copy_on_write_doc, + validator=is_one_of_factory([True, False, "warn"]), + ) + + +# user warnings +chained_assignment = """ +: string + Raise an exception, warn, or no action if trying to use chained assignment, + The default is warn +""" + +with cf.config_prefix("mode"): + cf.register_option( + "chained_assignment", + "warn", + chained_assignment, + validator=is_one_of_factory([None, "warn", "raise"]), + ) + + +string_storage_doc = """ +: string + The default storage for StringDtype. This option is ignored if + ``future.infer_string`` is set to True. +""" + +with cf.config_prefix("mode"): + cf.register_option( + "string_storage", + "python", + string_storage_doc, + validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]), + ) + + +# Set up the io.excel specific reader configuration. +reader_engine_doc = """ +: string + The default Excel reader engine for '{ext}' files. Available options: + auto, {others}. +""" + +_xls_options = ["xlrd", "calamine"] +_xlsm_options = ["xlrd", "openpyxl", "calamine"] +_xlsx_options = ["xlrd", "openpyxl", "calamine"] +_ods_options = ["odf", "calamine"] +_xlsb_options = ["pyxlsb", "calamine"] + + +with cf.config_prefix("io.excel.xls"): + cf.register_option( + "reader", + "auto", + reader_engine_doc.format(ext="xls", others=", ".join(_xls_options)), + validator=is_one_of_factory(_xls_options + ["auto"]), + ) + +with cf.config_prefix("io.excel.xlsm"): + cf.register_option( + "reader", + "auto", + reader_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)), + validator=is_one_of_factory(_xlsm_options + ["auto"]), + ) + + +with cf.config_prefix("io.excel.xlsx"): + cf.register_option( + "reader", + "auto", + reader_engine_doc.format(ext="xlsx", others=", ".join(_xlsx_options)), + validator=is_one_of_factory(_xlsx_options + ["auto"]), + ) + + +with cf.config_prefix("io.excel.ods"): + cf.register_option( + "reader", + "auto", + reader_engine_doc.format(ext="ods", others=", ".join(_ods_options)), + validator=is_one_of_factory(_ods_options + ["auto"]), + ) + +with cf.config_prefix("io.excel.xlsb"): + cf.register_option( + "reader", + "auto", + reader_engine_doc.format(ext="xlsb", others=", ".join(_xlsb_options)), + validator=is_one_of_factory(_xlsb_options + ["auto"]), + ) + +# Set up the io.excel specific writer configuration. +writer_engine_doc = """ +: string + The default Excel writer engine for '{ext}' files. Available options: + auto, {others}. +""" + +_xlsm_options = ["openpyxl"] +_xlsx_options = ["openpyxl", "xlsxwriter"] +_ods_options = ["odf"] + + +with cf.config_prefix("io.excel.xlsm"): + cf.register_option( + "writer", + "auto", + writer_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)), + validator=str, + ) + + +with cf.config_prefix("io.excel.xlsx"): + cf.register_option( + "writer", + "auto", + writer_engine_doc.format(ext="xlsx", others=", ".join(_xlsx_options)), + validator=str, + ) + + +with cf.config_prefix("io.excel.ods"): + cf.register_option( + "writer", + "auto", + writer_engine_doc.format(ext="ods", others=", ".join(_ods_options)), + validator=str, + ) + + +# Set up the io.parquet specific configuration. +parquet_engine_doc = """ +: string + The default parquet reader/writer engine. Available options: + 'auto', 'pyarrow', 'fastparquet', the default is 'auto' +""" + +with cf.config_prefix("io.parquet"): + cf.register_option( + "engine", + "auto", + parquet_engine_doc, + validator=is_one_of_factory(["auto", "pyarrow", "fastparquet"]), + ) + + +# Set up the io.sql specific configuration. +sql_engine_doc = """ +: string + The default sql reader/writer engine. Available options: + 'auto', 'sqlalchemy', the default is 'auto' +""" + +with cf.config_prefix("io.sql"): + cf.register_option( + "engine", + "auto", + sql_engine_doc, + validator=is_one_of_factory(["auto", "sqlalchemy"]), + ) + +# -------- +# Plotting +# --------- + +plotting_backend_doc = """ +: str + The plotting backend to use. The default value is "matplotlib", the + backend provided with pandas. Other backends can be specified by + providing the name of the module that implements the backend. +""" + + +def register_plotting_backend_cb(key) -> None: + if key == "matplotlib": + # We defer matplotlib validation, since it's the default + return + from pandas.plotting._core import _get_plot_backend + + _get_plot_backend(key) + + +with cf.config_prefix("plotting"): + cf.register_option( + "backend", + defval="matplotlib", + doc=plotting_backend_doc, + validator=register_plotting_backend_cb, + ) + + +register_converter_doc = """ +: bool or 'auto'. + Whether to register converters with matplotlib's units registry for + dates, times, datetimes, and Periods. Toggling to False will remove + the converters, restoring any converters that pandas overwrote. +""" + + +def register_converter_cb(key) -> None: + from pandas.plotting import ( + deregister_matplotlib_converters, + register_matplotlib_converters, + ) + + if cf.get_option(key): + register_matplotlib_converters() + else: + deregister_matplotlib_converters() + + +with cf.config_prefix("plotting.matplotlib"): + cf.register_option( + "register_converters", + "auto", + register_converter_doc, + validator=is_one_of_factory(["auto", True, False]), + cb=register_converter_cb, + ) + +# ------ +# Styler +# ------ + +styler_sparse_index_doc = """ +: bool + Whether to sparsify the display of a hierarchical index. Setting to False will + display each explicit level element in a hierarchical key for each row. +""" + +styler_sparse_columns_doc = """ +: bool + Whether to sparsify the display of hierarchical columns. Setting to False will + display each explicit level element in a hierarchical key for each column. +""" + +styler_render_repr = """ +: str + Determine which output to use in Jupyter Notebook in {"html", "latex"}. +""" + +styler_max_elements = """ +: int + The maximum number of data-cell (' + assert expected in s.to_html() + + # only the value should be escaped before passing to the formatter + s = Styler(df, uuid_len=0).format("&{0}&", escape=escape) + expected = f'' + assert expected in s.to_html() + + # also test format_index() + styler = Styler(DataFrame(columns=[chars]), uuid_len=0) + styler.format_index("&{0}&", escape=None, axis=1) + assert styler._translate(True, True)["head"][0][1]["display_value"] == f"&{chars}&" + styler.format_index("&{0}&", escape=escape, axis=1) + assert styler._translate(True, True)["head"][0][1]["display_value"] == f"&{exp}&" + + +@pytest.mark.parametrize( + "chars, expected", + [ + ( + r"$ \$&%#_{}~^\ $ &%#_{}~^\ $", + "".join( + [ + r"$ \$&%#_{}~^\ $ ", + r"\&\%\#\_\{\}\textasciitilde \textasciicircum ", + r"\textbackslash \space \$", + ] + ), + ), + ( + r"\( &%#_{}~^\ \) &%#_{}~^\ \(", + "".join( + [ + r"\( &%#_{}~^\ \) ", + r"\&\%\#\_\{\}\textasciitilde \textasciicircum ", + r"\textbackslash \space \textbackslash (", + ] + ), + ), + ( + r"$\&%#_{}^\$", + r"\$\textbackslash \&\%\#\_\{\}\textasciicircum \textbackslash \$", + ), + ( + r"$ \frac{1}{2} $ \( \frac{1}{2} \)", + "".join( + [ + r"$ \frac{1}{2} $", + r" \textbackslash ( \textbackslash frac\{1\}\{2\} \textbackslash )", + ] + ), + ), + ], +) +def test_format_escape_latex_math(chars, expected): + # GH 51903 + # latex-math escape works for each DataFrame cell separately. If we have + # a combination of dollar signs and brackets, the dollar sign would apply. + df = DataFrame([[chars]]) + s = df.style.format("{0}", escape="latex-math") + assert s._translate(True, True)["body"][0][1]["display_value"] == expected + + +def test_format_escape_na_rep(): + # tests the na_rep is not escaped + df = DataFrame([['<>&"', None]]) + s = Styler(df, uuid_len=0).format("X&{0}>X", escape="html", na_rep="&") + ex = '' + expected2 = '' + assert ex in s.to_html() + assert expected2 in s.to_html() + + # also test for format_index() + df = DataFrame(columns=['<>&"', None]) + styler = Styler(df, uuid_len=0) + styler.format_index("X&{0}>X", escape="html", na_rep="&", axis=1) + ctx = styler._translate(True, True) + assert ctx["head"][0][1]["display_value"] == "X&<>&">X" + assert ctx["head"][0][2]["display_value"] == "&" + + +def test_format_escape_floats(styler): + # test given formatter for number format is not impacted by escape + s = styler.format("{:.1f}", escape="html") + for expected in [">0.0<", ">1.0<", ">-1.2<", ">-0.6<"]: + assert expected in s.to_html() + # tests precision of floats is not impacted by escape + s = styler.format(precision=1, escape="html") + for expected in [">0<", ">1<", ">-1.2<", ">-0.6<"]: + assert expected in s.to_html() + + +@pytest.mark.parametrize("formatter", [5, True, [2.0]]) +@pytest.mark.parametrize("func", ["format", "format_index"]) +def test_format_raises(styler, formatter, func): + with pytest.raises(TypeError, match="expected str or callable"): + getattr(styler, func)(formatter) + + +@pytest.mark.parametrize( + "precision, expected", + [ + (1, ["1.0", "2.0", "3.2", "4.6"]), + (2, ["1.00", "2.01", "3.21", "4.57"]), + (3, ["1.000", "2.009", "3.212", "4.566"]), + ], +) +def test_format_with_precision(precision, expected): + # Issue #13257 + df = DataFrame([[1.0, 2.0090, 3.2121, 4.566]], columns=[1.0, 2.0090, 3.2121, 4.566]) + styler = Styler(df) + styler.format(precision=precision) + styler.format_index(precision=precision, axis=1) + + ctx = styler._translate(True, True) + for col, exp in enumerate(expected): + assert ctx["body"][0][col + 1]["display_value"] == exp # format test + assert ctx["head"][0][col + 1]["display_value"] == exp # format_index test + + +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize( + "level, expected", + [ + (0, ["X", "X", "_", "_"]), # level int + ("zero", ["X", "X", "_", "_"]), # level name + (1, ["_", "_", "X", "X"]), # other level int + ("one", ["_", "_", "X", "X"]), # other level name + ([0, 1], ["X", "X", "X", "X"]), # both levels + ([0, "zero"], ["X", "X", "_", "_"]), # level int and name simultaneous + ([0, "one"], ["X", "X", "X", "X"]), # both levels as int and name + (["one", "zero"], ["X", "X", "X", "X"]), # both level names, reversed + ], +) +def test_format_index_level(axis, level, expected): + midx = MultiIndex.from_arrays([["_", "_"], ["_", "_"]], names=["zero", "one"]) + df = DataFrame([[1, 2], [3, 4]]) + if axis == 0: + df.index = midx + else: + df.columns = midx + + styler = df.style.format_index(lambda v: "X", level=level, axis=axis) + ctx = styler._translate(True, True) + + if axis == 0: # compare index + result = [ctx["body"][s][0]["display_value"] for s in range(2)] + result += [ctx["body"][s][1]["display_value"] for s in range(2)] + else: # compare columns + result = [ctx["head"][0][s + 1]["display_value"] for s in range(2)] + result += [ctx["head"][1][s + 1]["display_value"] for s in range(2)] + + assert expected == result + + +def test_format_subset(): + df = DataFrame([[0.1234, 0.1234], [1.1234, 1.1234]], columns=["a", "b"]) + ctx = df.style.format( + {"a": "{:0.1f}", "b": "{0:.2%}"}, subset=IndexSlice[0, :] + )._translate(True, True) + expected = "0.1" + raw_11 = "1.123400" + assert ctx["body"][0][1]["display_value"] == expected + assert ctx["body"][1][1]["display_value"] == raw_11 + assert ctx["body"][0][2]["display_value"] == "12.34%" + + ctx = df.style.format("{:0.1f}", subset=IndexSlice[0, :])._translate(True, True) + assert ctx["body"][0][1]["display_value"] == expected + assert ctx["body"][1][1]["display_value"] == raw_11 + + ctx = df.style.format("{:0.1f}", subset=IndexSlice["a"])._translate(True, True) + assert ctx["body"][0][1]["display_value"] == expected + assert ctx["body"][0][2]["display_value"] == "0.123400" + + ctx = df.style.format("{:0.1f}", subset=IndexSlice[0, "a"])._translate(True, True) + assert ctx["body"][0][1]["display_value"] == expected + assert ctx["body"][1][1]["display_value"] == raw_11 + + ctx = df.style.format("{:0.1f}", subset=IndexSlice[[0, 1], ["a"]])._translate( + True, True + ) + assert ctx["body"][0][1]["display_value"] == expected + assert ctx["body"][1][1]["display_value"] == "1.1" + assert ctx["body"][0][2]["display_value"] == "0.123400" + assert ctx["body"][1][2]["display_value"] == raw_11 + + +@pytest.mark.parametrize("formatter", [None, "{:,.1f}"]) +@pytest.mark.parametrize("decimal", [".", "*"]) +@pytest.mark.parametrize("precision", [None, 2]) +@pytest.mark.parametrize("func, col", [("format", 1), ("format_index", 0)]) +def test_format_thousands(formatter, decimal, precision, func, col): + styler = DataFrame([[1000000.123456789]], index=[1000000.123456789]).style + result = getattr(styler, func)( # testing float + thousands="_", formatter=formatter, decimal=decimal, precision=precision + )._translate(True, True) + assert "1_000_000" in result["body"][0][col]["display_value"] + + styler = DataFrame([[1000000]], index=[1000000]).style + result = getattr(styler, func)( # testing int + thousands="_", formatter=formatter, decimal=decimal, precision=precision + )._translate(True, True) + assert "1_000_000" in result["body"][0][col]["display_value"] + + styler = DataFrame([[1 + 1000000.123456789j]], index=[1 + 1000000.123456789j]).style + result = getattr(styler, func)( # testing complex + thousands="_", formatter=formatter, decimal=decimal, precision=precision + )._translate(True, True) + assert "1_000_000" in result["body"][0][col]["display_value"] + + +@pytest.mark.parametrize("formatter", [None, "{:,.4f}"]) +@pytest.mark.parametrize("thousands", [None, ",", "*"]) +@pytest.mark.parametrize("precision", [None, 4]) +@pytest.mark.parametrize("func, col", [("format", 1), ("format_index", 0)]) +def test_format_decimal(formatter, thousands, precision, func, col): + styler = DataFrame([[1000000.123456789]], index=[1000000.123456789]).style + result = getattr(styler, func)( # testing float + decimal="_", formatter=formatter, thousands=thousands, precision=precision + )._translate(True, True) + assert "000_123" in result["body"][0][col]["display_value"] + + styler = DataFrame([[1 + 1000000.123456789j]], index=[1 + 1000000.123456789j]).style + result = getattr(styler, func)( # testing complex + decimal="_", formatter=formatter, thousands=thousands, precision=precision + )._translate(True, True) + assert "000_123" in result["body"][0][col]["display_value"] + + +def test_str_escape_error(): + msg = "`escape` only permitted in {'html', 'latex', 'latex-math'}, got " + with pytest.raises(ValueError, match=msg): + _str_escape("text", "bad_escape") + + with pytest.raises(ValueError, match=msg): + _str_escape("text", []) + + _str_escape(2.00, "bad_escape") # OK since dtype is float + + +def test_long_int_formatting(): + df = DataFrame(data=[[1234567890123456789]], columns=["test"]) + styler = df.style + ctx = styler._translate(True, True) + assert ctx["body"][0][1]["display_value"] == "1234567890123456789" + + styler = df.style.format(thousands="_") + ctx = styler._translate(True, True) + assert ctx["body"][0][1]["display_value"] == "1_234_567_890_123_456_789" + + +def test_format_options(): + df = DataFrame({"int": [2000, 1], "float": [1.009, None], "str": ["&<", "&~"]}) + ctx = df.style._translate(True, True) + + # test option: na_rep + assert ctx["body"][1][2]["display_value"] == "nan" + with option_context("styler.format.na_rep", "MISSING"): + ctx_with_op = df.style._translate(True, True) + assert ctx_with_op["body"][1][2]["display_value"] == "MISSING" + + # test option: decimal and precision + assert ctx["body"][0][2]["display_value"] == "1.009000" + with option_context("styler.format.decimal", "_"): + ctx_with_op = df.style._translate(True, True) + assert ctx_with_op["body"][0][2]["display_value"] == "1_009000" + with option_context("styler.format.precision", 2): + ctx_with_op = df.style._translate(True, True) + assert ctx_with_op["body"][0][2]["display_value"] == "1.01" + + # test option: thousands + assert ctx["body"][0][1]["display_value"] == "2000" + with option_context("styler.format.thousands", "_"): + ctx_with_op = df.style._translate(True, True) + assert ctx_with_op["body"][0][1]["display_value"] == "2_000" + + # test option: escape + assert ctx["body"][0][3]["display_value"] == "&<" + assert ctx["body"][1][3]["display_value"] == "&~" + with option_context("styler.format.escape", "html"): + ctx_with_op = df.style._translate(True, True) + assert ctx_with_op["body"][0][3]["display_value"] == "&<" + with option_context("styler.format.escape", "latex"): + ctx_with_op = df.style._translate(True, True) + assert ctx_with_op["body"][1][3]["display_value"] == "\\&\\textasciitilde " + with option_context("styler.format.escape", "latex-math"): + ctx_with_op = df.style._translate(True, True) + assert ctx_with_op["body"][1][3]["display_value"] == "\\&\\textasciitilde " + + # test option: formatter + with option_context("styler.format.formatter", {"int": "{:,.2f}"}): + ctx_with_op = df.style._translate(True, True) + assert ctx_with_op["body"][0][1]["display_value"] == "2,000.00" + + +def test_precision_zero(df): + styler = Styler(df, precision=0) + ctx = styler._translate(True, True) + assert ctx["body"][0][2]["display_value"] == "-1" + assert ctx["body"][1][2]["display_value"] == "-1" + + +@pytest.mark.parametrize( + "formatter, exp", + [ + (lambda x: f"{x:.3f}", "9.000"), + ("{:.2f}", "9.00"), + ({0: "{:.1f}"}, "9.0"), + (None, "9"), + ], +) +def test_formatter_options_validator(formatter, exp): + df = DataFrame([[9]]) + with option_context("styler.format.formatter", formatter): + assert f" {exp} " in df.style.to_latex() + + +def test_formatter_options_raises(): + msg = "Value must be an instance of" + with pytest.raises(ValueError, match=msg): + with option_context("styler.format.formatter", ["bad", "type"]): + DataFrame().style.to_latex() + + +def test_1level_multiindex(): + # GH 43383 + midx = MultiIndex.from_product([[1, 2]], names=[""]) + df = DataFrame(-1, index=midx, columns=[0, 1]) + ctx = df.style._translate(True, True) + assert ctx["body"][0][0]["display_value"] == "1" + assert ctx["body"][0][0]["is_visible"] is True + assert ctx["body"][1][0]["display_value"] == "2" + assert ctx["body"][1][0]["is_visible"] is True + + +def test_boolean_format(): + # gh 46384: booleans do not collapse to integer representation on display + df = DataFrame([[True, False]]) + ctx = df.style._translate(True, True) + assert ctx["body"][0][1]["display_value"] is True + assert ctx["body"][0][2]["display_value"] is False + + +@pytest.mark.parametrize( + "hide, labels", + [ + (False, [1, 2]), + (True, [1, 2, 3, 4]), + ], +) +def test_relabel_raise_length(styler_multi, hide, labels): + if hide: + styler_multi.hide(axis=0, subset=[("X", "x"), ("Y", "y")]) + with pytest.raises(ValueError, match="``labels`` must be of length equal"): + styler_multi.relabel_index(labels=labels) + + +def test_relabel_index(styler_multi): + labels = [(1, 2), (3, 4)] + styler_multi.hide(axis=0, subset=[("X", "x"), ("Y", "y")]) + styler_multi.relabel_index(labels=labels) + ctx = styler_multi._translate(True, True) + assert {"value": "X", "display_value": 1}.items() <= ctx["body"][0][0].items() + assert {"value": "y", "display_value": 2}.items() <= ctx["body"][0][1].items() + assert {"value": "Y", "display_value": 3}.items() <= ctx["body"][1][0].items() + assert {"value": "x", "display_value": 4}.items() <= ctx["body"][1][1].items() + + +def test_relabel_columns(styler_multi): + labels = [(1, 2), (3, 4)] + styler_multi.hide(axis=1, subset=[("A", "a"), ("B", "b")]) + styler_multi.relabel_index(axis=1, labels=labels) + ctx = styler_multi._translate(True, True) + assert {"value": "A", "display_value": 1}.items() <= ctx["head"][0][3].items() + assert {"value": "B", "display_value": 3}.items() <= ctx["head"][0][4].items() + assert {"value": "b", "display_value": 2}.items() <= ctx["head"][1][3].items() + assert {"value": "a", "display_value": 4}.items() <= ctx["head"][1][4].items() + + +def test_relabel_roundtrip(styler): + styler.relabel_index(["{}", "{}"]) + ctx = styler._translate(True, True) + assert {"value": "x", "display_value": "x"}.items() <= ctx["body"][0][0].items() + assert {"value": "y", "display_value": "y"}.items() <= ctx["body"][1][0].items() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_highlight.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_highlight.py new file mode 100644 index 0000000000000000000000000000000000000000..3d59719010ee03cc53373a1c96f5f8c5611d7681 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_highlight.py @@ -0,0 +1,218 @@ +import numpy as np +import pytest + +from pandas import ( + NA, + DataFrame, + IndexSlice, +) + +pytest.importorskip("jinja2") + +from pandas.io.formats.style import Styler + + +@pytest.fixture(params=[(None, "float64"), (NA, "Int64")]) +def df(request): + # GH 45804 + return DataFrame( + {"A": [0, np.nan, 10], "B": [1, request.param[0], 2]}, dtype=request.param[1] + ) + + +@pytest.fixture +def styler(df): + return Styler(df, uuid_len=0) + + +def test_highlight_null(styler): + result = styler.highlight_null()._compute().ctx + expected = { + (1, 0): [("background-color", "red")], + (1, 1): [("background-color", "red")], + } + assert result == expected + + +def test_highlight_null_subset(styler): + # GH 31345 + result = ( + styler.highlight_null(color="red", subset=["A"]) + .highlight_null(color="green", subset=["B"]) + ._compute() + .ctx + ) + expected = { + (1, 0): [("background-color", "red")], + (1, 1): [("background-color", "green")], + } + assert result == expected + + +@pytest.mark.parametrize("f", ["highlight_min", "highlight_max"]) +def test_highlight_minmax_basic(df, f): + expected = { + (0, 1): [("background-color", "red")], + # ignores NaN row, + (2, 0): [("background-color", "red")], + } + if f == "highlight_min": + df = -df + result = getattr(df.style, f)(axis=1, color="red")._compute().ctx + assert result == expected + + +@pytest.mark.parametrize("f", ["highlight_min", "highlight_max"]) +@pytest.mark.parametrize( + "kwargs", + [ + {"axis": None, "color": "red"}, # test axis + {"axis": 0, "subset": ["A"], "color": "red"}, # test subset and ignores NaN + {"axis": None, "props": "background-color: red"}, # test props + ], +) +def test_highlight_minmax_ext(df, f, kwargs): + expected = {(2, 0): [("background-color", "red")]} + if f == "highlight_min": + df = -df + result = getattr(df.style, f)(**kwargs)._compute().ctx + assert result == expected + + +@pytest.mark.parametrize("f", ["highlight_min", "highlight_max"]) +@pytest.mark.parametrize("axis", [None, 0, 1]) +def test_highlight_minmax_nulls(f, axis): + # GH 42750 + expected = { + (1, 0): [("background-color", "yellow")], + (1, 1): [("background-color", "yellow")], + } + if axis == 1: + expected.update({(2, 1): [("background-color", "yellow")]}) + + if f == "highlight_max": + df = DataFrame({"a": [NA, 1, None], "b": [np.nan, 1, -1]}) + else: + df = DataFrame({"a": [NA, -1, None], "b": [np.nan, -1, 1]}) + + result = getattr(df.style, f)(axis=axis)._compute().ctx + assert result == expected + + +@pytest.mark.parametrize( + "kwargs", + [ + {"left": 0, "right": 1}, # test basic range + {"left": 0, "right": 1, "props": "background-color: yellow"}, # test props + {"left": -100, "right": 100, "subset": IndexSlice[[0, 1], :]}, # test subset + {"left": 0, "subset": IndexSlice[[0, 1], :]}, # test no right + {"right": 1}, # test no left + {"left": [0, 0, 11], "axis": 0}, # test left as sequence + {"left": DataFrame({"A": [0, 0, 11], "B": [1, 1, 11]}), "axis": None}, # axis + {"left": 0, "right": [0, 1], "axis": 1}, # test sequence right + ], +) +def test_highlight_between(styler, kwargs): + expected = { + (0, 0): [("background-color", "yellow")], + (0, 1): [("background-color", "yellow")], + } + result = styler.highlight_between(**kwargs)._compute().ctx + assert result == expected + + +@pytest.mark.parametrize( + "arg, map, axis", + [ + ("left", [1, 2], 0), # 0 axis has 3 elements not 2 + ("left", [1, 2, 3], 1), # 1 axis has 2 elements not 3 + ("left", np.array([[1, 2], [1, 2]]), None), # df is (2,3) not (2,2) + ("right", [1, 2], 0), # same tests as above for 'right' not 'left' + ("right", [1, 2, 3], 1), # .. + ("right", np.array([[1, 2], [1, 2]]), None), # .. + ], +) +def test_highlight_between_raises(arg, styler, map, axis): + msg = f"supplied '{arg}' is not correct shape" + with pytest.raises(ValueError, match=msg): + styler.highlight_between(**{arg: map, "axis": axis})._compute() + + +def test_highlight_between_raises2(styler): + msg = "values can be 'both', 'left', 'right', or 'neither'" + with pytest.raises(ValueError, match=msg): + styler.highlight_between(inclusive="badstring")._compute() + + with pytest.raises(ValueError, match=msg): + styler.highlight_between(inclusive=1)._compute() + + +@pytest.mark.parametrize( + "inclusive, expected", + [ + ( + "both", + { + (0, 0): [("background-color", "yellow")], + (0, 1): [("background-color", "yellow")], + }, + ), + ("neither", {}), + ("left", {(0, 0): [("background-color", "yellow")]}), + ("right", {(0, 1): [("background-color", "yellow")]}), + ], +) +def test_highlight_between_inclusive(styler, inclusive, expected): + kwargs = {"left": 0, "right": 1, "subset": IndexSlice[[0, 1], :]} + result = styler.highlight_between(**kwargs, inclusive=inclusive)._compute() + assert result.ctx == expected + + +@pytest.mark.parametrize( + "kwargs", + [ + {"q_left": 0.5, "q_right": 1, "axis": 0}, # base case + {"q_left": 0.5, "q_right": 1, "axis": None}, # test axis + {"q_left": 0, "q_right": 1, "subset": IndexSlice[2, :]}, # test subset + {"q_left": 0.5, "axis": 0}, # test no high + {"q_right": 1, "subset": IndexSlice[2, :], "axis": 1}, # test no low + {"q_left": 0.5, "axis": 0, "props": "background-color: yellow"}, # tst prop + ], +) +def test_highlight_quantile(styler, kwargs): + expected = { + (2, 0): [("background-color", "yellow")], + (2, 1): [("background-color", "yellow")], + } + result = styler.highlight_quantile(**kwargs)._compute().ctx + assert result == expected + + +@pytest.mark.parametrize( + "f,kwargs", + [ + ("highlight_min", {"axis": 1, "subset": IndexSlice[1, :]}), + ("highlight_max", {"axis": 0, "subset": [0]}), + ("highlight_quantile", {"axis": None, "q_left": 0.6, "q_right": 0.8}), + ("highlight_between", {"subset": [0]}), + ], +) +@pytest.mark.parametrize( + "df", + [ + DataFrame([[0, 10], [20, 30]], dtype=int), + DataFrame([[0, 10], [20, 30]], dtype=float), + DataFrame([[0, 10], [20, 30]], dtype="datetime64[ns]"), + DataFrame([[0, 10], [20, 30]], dtype=str), + DataFrame([[0, 10], [20, 30]], dtype="timedelta64[ns]"), + ], +) +def test_all_highlight_dtypes(f, kwargs, df): + if f == "highlight_quantile" and isinstance(df.iloc[0, 0], (str)): + return None # quantile incompatible with str + if f == "highlight_between": + kwargs["left"] = df.iloc[1, 0] # set the range low for testing + + expected = {(1, 0): [("background-color", "yellow")]} + result = getattr(df.style, f)(**kwargs)._compute().ctx + assert result == expected diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_html.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_html.py new file mode 100644 index 0000000000000000000000000000000000000000..1e345eb82ed3c31e7a5e0f89fa574aea84923dd7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_html.py @@ -0,0 +1,1009 @@ +from textwrap import ( + dedent, + indent, +) + +import numpy as np +import pytest + +from pandas import ( + DataFrame, + MultiIndex, + option_context, +) + +jinja2 = pytest.importorskip("jinja2") +from pandas.io.formats.style import Styler + + +@pytest.fixture +def env(): + loader = jinja2.PackageLoader("pandas", "io/formats/templates") + env = jinja2.Environment(loader=loader, trim_blocks=True) + return env + + +@pytest.fixture +def styler(): + return Styler(DataFrame([[2.61], [2.69]], index=["a", "b"], columns=["A"])) + + +@pytest.fixture +def styler_mi(): + midx = MultiIndex.from_product([["a", "b"], ["c", "d"]]) + return Styler(DataFrame(np.arange(16).reshape(4, 4), index=midx, columns=midx)) + + +@pytest.fixture +def tpl_style(env): + return env.get_template("html_style.tpl") + + +@pytest.fixture +def tpl_table(env): + return env.get_template("html_table.tpl") + + +def test_html_template_extends_options(): + # make sure if templates are edited tests are updated as are setup fixtures + # to understand the dependency + with open("pandas/io/formats/templates/html.tpl", encoding="utf-8") as file: + result = file.read() + assert "{% include html_style_tpl %}" in result + assert "{% include html_table_tpl %}" in result + + +def test_exclude_styles(styler): + result = styler.to_html(exclude_styles=True, doctype_html=True) + expected = dedent( + """\ + + + + + + +
) elements that will be rendered before + trimming will occur over columns, rows or both if needed. +""" + +styler_max_rows = """ +: int, optional + The maximum number of rows that will be rendered. May still be reduced to + satisfy ``max_elements``, which takes precedence. +""" + +styler_max_columns = """ +: int, optional + The maximum number of columns that will be rendered. May still be reduced to + satisfy ``max_elements``, which takes precedence. +""" + +styler_precision = """ +: int + The precision for floats and complex numbers. +""" + +styler_decimal = """ +: str + The character representation for the decimal separator for floats and complex. +""" + +styler_thousands = """ +: str, optional + The character representation for thousands separator for floats, int and complex. +""" + +styler_na_rep = """ +: str, optional + The string representation for values identified as missing. +""" + +styler_escape = """ +: str, optional + Whether to escape certain characters according to the given context; html or latex. +""" + +styler_formatter = """ +: str, callable, dict, optional + A formatter object to be used as default within ``Styler.format``. +""" + +styler_multirow_align = """ +: {"c", "t", "b"} + The specifier for vertical alignment of sparsified LaTeX multirows. +""" + +styler_multicol_align = r""" +: {"r", "c", "l", "naive-l", "naive-r"} + The specifier for horizontal alignment of sparsified LaTeX multicolumns. Pipe + decorators can also be added to non-naive values to draw vertical + rules, e.g. "\|r" will draw a rule on the left side of right aligned merged cells. +""" + +styler_hrules = """ +: bool + Whether to add horizontal rules on top and bottom and below the headers. +""" + +styler_environment = """ +: str + The environment to replace ``\\begin{table}``. If "longtable" is used results + in a specific longtable environment format. +""" + +styler_encoding = """ +: str + The encoding used for output HTML and LaTeX files. +""" + +styler_mathjax = """ +: bool + If False will render special CSS classes to table attributes that indicate Mathjax + will not be used in Jupyter Notebook. +""" + +with cf.config_prefix("styler"): + cf.register_option("sparse.index", True, styler_sparse_index_doc, validator=is_bool) + + cf.register_option( + "sparse.columns", True, styler_sparse_columns_doc, validator=is_bool + ) + + cf.register_option( + "render.repr", + "html", + styler_render_repr, + validator=is_one_of_factory(["html", "latex"]), + ) + + cf.register_option( + "render.max_elements", + 2**18, + styler_max_elements, + validator=is_nonnegative_int, + ) + + cf.register_option( + "render.max_rows", + None, + styler_max_rows, + validator=is_nonnegative_int, + ) + + cf.register_option( + "render.max_columns", + None, + styler_max_columns, + validator=is_nonnegative_int, + ) + + cf.register_option("render.encoding", "utf-8", styler_encoding, validator=is_str) + + cf.register_option("format.decimal", ".", styler_decimal, validator=is_str) + + cf.register_option( + "format.precision", 6, styler_precision, validator=is_nonnegative_int + ) + + cf.register_option( + "format.thousands", + None, + styler_thousands, + validator=is_instance_factory([type(None), str]), + ) + + cf.register_option( + "format.na_rep", + None, + styler_na_rep, + validator=is_instance_factory([type(None), str]), + ) + + cf.register_option( + "format.escape", + None, + styler_escape, + validator=is_one_of_factory([None, "html", "latex", "latex-math"]), + ) + + cf.register_option( + "format.formatter", + None, + styler_formatter, + validator=is_instance_factory([type(None), dict, Callable, str]), + ) + + cf.register_option("html.mathjax", True, styler_mathjax, validator=is_bool) + + cf.register_option( + "latex.multirow_align", + "c", + styler_multirow_align, + validator=is_one_of_factory(["c", "t", "b", "naive"]), + ) + + val_mca = ["r", "|r|", "|r", "r|", "c", "|c|", "|c", "c|", "l", "|l|", "|l", "l|"] + val_mca += ["naive-l", "naive-r"] + cf.register_option( + "latex.multicol_align", + "r", + styler_multicol_align, + validator=is_one_of_factory(val_mca), + ) + + cf.register_option("latex.hrules", False, styler_hrules, validator=is_bool) + + cf.register_option( + "latex.environment", + None, + styler_environment, + validator=is_instance_factory([type(None), str]), + ) + + +with cf.config_prefix("future"): + cf.register_option( + "infer_string", + False, + "Whether to infer sequence of str objects as pyarrow string " + "dtype, which will be the default in pandas 3.0 " + "(at which point this option will be deprecated).", + validator=is_one_of_factory([True, False]), + ) + + cf.register_option( + "no_silent_downcasting", + False, + "Whether to opt-in to the future behavior which will *not* silently " + "downcast results from Series and DataFrame `where`, `mask`, and `clip` " + "methods. " + "Silent downcasting will be removed in pandas 3.0 " + "(at which point this option will be deprecated).", + validator=is_one_of_factory([True, False]), + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/construction.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/construction.py new file mode 100644 index 0000000000000000000000000000000000000000..f8250ae475a10317c7731c10c475e88cc33f030b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/construction.py @@ -0,0 +1,824 @@ +""" +Constructor functions intended to be shared by pd.array, Series.__init__, +and Index.__new__. + +These should not depend on core.internals. +""" +from __future__ import annotations + +from collections.abc import Sequence +from typing import ( + TYPE_CHECKING, + Optional, + Union, + cast, + overload, +) +import warnings + +import numpy as np +from numpy import ma + +from pandas._config import using_pyarrow_string_dtype + +from pandas._libs import lib +from pandas._libs.tslibs import ( + Period, + get_supported_dtype, + is_supported_dtype, +) +from pandas._typing import ( + AnyArrayLike, + ArrayLike, + Dtype, + DtypeObj, + T, +) +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.cast import ( + construct_1d_arraylike_from_scalar, + construct_1d_object_array_from_listlike, + maybe_cast_to_datetime, + maybe_cast_to_integer_array, + maybe_convert_platform, + maybe_infer_to_datetimelike, + maybe_promote, +) +from pandas.core.dtypes.common import ( + is_list_like, + is_object_dtype, + is_string_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import NumpyEADtype +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCExtensionArray, + ABCIndex, + ABCSeries, +) +from pandas.core.dtypes.missing import isna + +import pandas.core.common as com + +if TYPE_CHECKING: + from pandas import ( + Index, + Series, + ) + from pandas.core.arrays.base import ExtensionArray + + +def array( + data: Sequence[object] | AnyArrayLike, + dtype: Dtype | None = None, + copy: bool = True, +) -> ExtensionArray: + """ + Create an array. + + Parameters + ---------- + data : Sequence of objects + The scalars inside `data` should be instances of the + scalar type for `dtype`. It's expected that `data` + represents a 1-dimensional array of data. + + When `data` is an Index or Series, the underlying array + will be extracted from `data`. + + dtype : str, np.dtype, or ExtensionDtype, optional + The dtype to use for the array. This may be a NumPy + dtype or an extension type registered with pandas using + :meth:`pandas.api.extensions.register_extension_dtype`. + + If not specified, there are two possibilities: + + 1. When `data` is a :class:`Series`, :class:`Index`, or + :class:`ExtensionArray`, the `dtype` will be taken + from the data. + 2. Otherwise, pandas will attempt to infer the `dtype` + from the data. + + Note that when `data` is a NumPy array, ``data.dtype`` is + *not* used for inferring the array type. This is because + NumPy cannot represent all the types of data that can be + held in extension arrays. + + Currently, pandas will infer an extension dtype for sequences of + + ============================== ======================================= + Scalar Type Array Type + ============================== ======================================= + :class:`pandas.Interval` :class:`pandas.arrays.IntervalArray` + :class:`pandas.Period` :class:`pandas.arrays.PeriodArray` + :class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray` + :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray` + :class:`int` :class:`pandas.arrays.IntegerArray` + :class:`float` :class:`pandas.arrays.FloatingArray` + :class:`str` :class:`pandas.arrays.StringArray` or + :class:`pandas.arrays.ArrowStringArray` + :class:`bool` :class:`pandas.arrays.BooleanArray` + ============================== ======================================= + + The ExtensionArray created when the scalar type is :class:`str` is determined by + ``pd.options.mode.string_storage`` if the dtype is not explicitly given. + + For all other cases, NumPy's usual inference rules will be used. + copy : bool, default True + Whether to copy the data, even if not necessary. Depending + on the type of `data`, creating the new array may require + copying data, even if ``copy=False``. + + Returns + ------- + ExtensionArray + The newly created array. + + Raises + ------ + ValueError + When `data` is not 1-dimensional. + + See Also + -------- + numpy.array : Construct a NumPy array. + Series : Construct a pandas Series. + Index : Construct a pandas Index. + arrays.NumpyExtensionArray : ExtensionArray wrapping a NumPy array. + Series.array : Extract the array stored within a Series. + + Notes + ----- + Omitting the `dtype` argument means pandas will attempt to infer the + best array type from the values in the data. As new array types are + added by pandas and 3rd party libraries, the "best" array type may + change. We recommend specifying `dtype` to ensure that + + 1. the correct array type for the data is returned + 2. the returned array type doesn't change as new extension types + are added by pandas and third-party libraries + + Additionally, if the underlying memory representation of the returned + array matters, we recommend specifying the `dtype` as a concrete object + rather than a string alias or allowing it to be inferred. For example, + a future version of pandas or a 3rd-party library may include a + dedicated ExtensionArray for string data. In this event, the following + would no longer return a :class:`arrays.NumpyExtensionArray` backed by a + NumPy array. + + >>> pd.array(['a', 'b'], dtype=str) + + ['a', 'b'] + Length: 2, dtype: str32 + + This would instead return the new ExtensionArray dedicated for string + data. If you really need the new array to be backed by a NumPy array, + specify that in the dtype. + + >>> pd.array(['a', 'b'], dtype=np.dtype(" + ['a', 'b'] + Length: 2, dtype: str32 + + Finally, Pandas has arrays that mostly overlap with NumPy + + * :class:`arrays.DatetimeArray` + * :class:`arrays.TimedeltaArray` + + When data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype is + passed, pandas will always return a ``DatetimeArray`` or ``TimedeltaArray`` + rather than a ``NumpyExtensionArray``. This is for symmetry with the case of + timezone-aware data, which NumPy does not natively support. + + >>> pd.array(['2015', '2016'], dtype='datetime64[ns]') + + ['2015-01-01 00:00:00', '2016-01-01 00:00:00'] + Length: 2, dtype: datetime64[ns] + + >>> pd.array(["1h", "2h"], dtype='timedelta64[ns]') + + ['0 days 01:00:00', '0 days 02:00:00'] + Length: 2, dtype: timedelta64[ns] + + Examples + -------- + If a dtype is not specified, pandas will infer the best dtype from the values. + See the description of `dtype` for the types pandas infers for. + + >>> pd.array([1, 2]) + + [1, 2] + Length: 2, dtype: Int64 + + >>> pd.array([1, 2, np.nan]) + + [1, 2, ] + Length: 3, dtype: Int64 + + >>> pd.array([1.1, 2.2]) + + [1.1, 2.2] + Length: 2, dtype: Float64 + + >>> pd.array(["a", None, "c"]) + + ['a', , 'c'] + Length: 3, dtype: string + + >>> with pd.option_context("string_storage", "pyarrow"): + ... arr = pd.array(["a", None, "c"]) + ... + >>> arr + + ['a', , 'c'] + Length: 3, dtype: string + + >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) + + ['2000-01-01', '2000-01-01'] + Length: 2, dtype: period[D] + + You can use the string alias for `dtype` + + >>> pd.array(['a', 'b', 'a'], dtype='category') + ['a', 'b', 'a'] + Categories (2, object): ['a', 'b'] + + Or specify the actual dtype + + >>> pd.array(['a', 'b', 'a'], + ... dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True)) + ['a', 'b', 'a'] + Categories (3, object): ['a' < 'b' < 'c'] + + If pandas does not infer a dedicated extension type a + :class:`arrays.NumpyExtensionArray` is returned. + + >>> pd.array([1 + 1j, 3 + 2j]) + + [(1+1j), (3+2j)] + Length: 2, dtype: complex128 + + As mentioned in the "Notes" section, new extension types may be added + in the future (by pandas or 3rd party libraries), causing the return + value to no longer be a :class:`arrays.NumpyExtensionArray`. Specify the + `dtype` as a NumPy dtype if you need to ensure there's no future change in + behavior. + + >>> pd.array([1, 2], dtype=np.dtype("int32")) + + [1, 2] + Length: 2, dtype: int32 + + `data` must be 1-dimensional. A ValueError is raised when the input + has the wrong dimensionality. + + >>> pd.array(1) + Traceback (most recent call last): + ... + ValueError: Cannot pass scalar '1' to 'pandas.array'. + """ + from pandas.core.arrays import ( + BooleanArray, + DatetimeArray, + ExtensionArray, + FloatingArray, + IntegerArray, + IntervalArray, + NumpyExtensionArray, + PeriodArray, + TimedeltaArray, + ) + from pandas.core.arrays.string_ import StringDtype + + if lib.is_scalar(data): + msg = f"Cannot pass scalar '{data}' to 'pandas.array'." + raise ValueError(msg) + elif isinstance(data, ABCDataFrame): + raise TypeError("Cannot pass DataFrame to 'pandas.array'") + + if dtype is None and isinstance(data, (ABCSeries, ABCIndex, ExtensionArray)): + # Note: we exclude np.ndarray here, will do type inference on it + dtype = data.dtype + + data = extract_array(data, extract_numpy=True) + + # this returns None for not-found dtypes. + if dtype is not None: + dtype = pandas_dtype(dtype) + + if isinstance(data, ExtensionArray) and (dtype is None or data.dtype == dtype): + # e.g. TimedeltaArray[s], avoid casting to NumpyExtensionArray + if copy: + return data.copy() + return data + + if isinstance(dtype, ExtensionDtype): + cls = dtype.construct_array_type() + return cls._from_sequence(data, dtype=dtype, copy=copy) + + if dtype is None: + inferred_dtype = lib.infer_dtype(data, skipna=True) + if inferred_dtype == "period": + period_data = cast(Union[Sequence[Optional[Period]], AnyArrayLike], data) + return PeriodArray._from_sequence(period_data, copy=copy) + + elif inferred_dtype == "interval": + return IntervalArray(data, copy=copy) + + elif inferred_dtype.startswith("datetime"): + # datetime, datetime64 + try: + return DatetimeArray._from_sequence(data, copy=copy) + except ValueError: + # Mixture of timezones, fall back to NumpyExtensionArray + pass + + elif inferred_dtype.startswith("timedelta"): + # timedelta, timedelta64 + return TimedeltaArray._from_sequence(data, copy=copy) + + elif inferred_dtype == "string": + # StringArray/ArrowStringArray depending on pd.options.mode.string_storage + dtype = StringDtype() + cls = dtype.construct_array_type() + return cls._from_sequence(data, dtype=dtype, copy=copy) + + elif inferred_dtype == "integer": + return IntegerArray._from_sequence(data, copy=copy) + elif inferred_dtype == "empty" and not hasattr(data, "dtype") and not len(data): + return FloatingArray._from_sequence(data, copy=copy) + elif ( + inferred_dtype in ("floating", "mixed-integer-float") + and getattr(data, "dtype", None) != np.float16 + ): + # GH#44715 Exclude np.float16 bc FloatingArray does not support it; + # we will fall back to NumpyExtensionArray. + return FloatingArray._from_sequence(data, copy=copy) + + elif inferred_dtype == "boolean": + return BooleanArray._from_sequence(data, dtype="boolean", copy=copy) + + # Pandas overrides NumPy for + # 1. datetime64[ns,us,ms,s] + # 2. timedelta64[ns,us,ms,s] + # so that a DatetimeArray is returned. + if lib.is_np_dtype(dtype, "M") and is_supported_dtype(dtype): + return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy) + if lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype): + return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy) + + elif lib.is_np_dtype(dtype, "mM"): + warnings.warn( + r"datetime64 and timedelta64 dtype resolutions other than " + r"'s', 'ms', 'us', and 'ns' are deprecated. " + r"In future releases passing unsupported resolutions will " + r"raise an exception.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + return NumpyExtensionArray._from_sequence(data, dtype=dtype, copy=copy) + + +_typs = frozenset( + { + "index", + "rangeindex", + "multiindex", + "datetimeindex", + "timedeltaindex", + "periodindex", + "categoricalindex", + "intervalindex", + "series", + } +) + + +@overload +def extract_array( + obj: Series | Index, extract_numpy: bool = ..., extract_range: bool = ... +) -> ArrayLike: + ... + + +@overload +def extract_array( + obj: T, extract_numpy: bool = ..., extract_range: bool = ... +) -> T | ArrayLike: + ... + + +def extract_array( + obj: T, extract_numpy: bool = False, extract_range: bool = False +) -> T | ArrayLike: + """ + Extract the ndarray or ExtensionArray from a Series or Index. + + For all other types, `obj` is just returned as is. + + Parameters + ---------- + obj : object + For Series / Index, the underlying ExtensionArray is unboxed. + + extract_numpy : bool, default False + Whether to extract the ndarray from a NumpyExtensionArray. + + extract_range : bool, default False + If we have a RangeIndex, return range._values if True + (which is a materialized integer ndarray), otherwise return unchanged. + + Returns + ------- + arr : object + + Examples + -------- + >>> extract_array(pd.Series(['a', 'b', 'c'], dtype='category')) + ['a', 'b', 'c'] + Categories (3, object): ['a', 'b', 'c'] + + Other objects like lists, arrays, and DataFrames are just passed through. + + >>> extract_array([1, 2, 3]) + [1, 2, 3] + + For an ndarray-backed Series / Index the ndarray is returned. + + >>> extract_array(pd.Series([1, 2, 3])) + array([1, 2, 3]) + + To extract all the way down to the ndarray, pass ``extract_numpy=True``. + + >>> extract_array(pd.Series([1, 2, 3]), extract_numpy=True) + array([1, 2, 3]) + """ + typ = getattr(obj, "_typ", None) + if typ in _typs: + # i.e. isinstance(obj, (ABCIndex, ABCSeries)) + if typ == "rangeindex": + if extract_range: + # error: "T" has no attribute "_values" + return obj._values # type: ignore[attr-defined] + return obj + + # error: "T" has no attribute "_values" + return obj._values # type: ignore[attr-defined] + + elif extract_numpy and typ == "npy_extension": + # i.e. isinstance(obj, ABCNumpyExtensionArray) + # error: "T" has no attribute "to_numpy" + return obj.to_numpy() # type: ignore[attr-defined] + + return obj + + +def ensure_wrapped_if_datetimelike(arr): + """ + Wrap datetime64 and timedelta64 ndarrays in DatetimeArray/TimedeltaArray. + """ + if isinstance(arr, np.ndarray): + if arr.dtype.kind == "M": + from pandas.core.arrays import DatetimeArray + + dtype = get_supported_dtype(arr.dtype) + return DatetimeArray._from_sequence(arr, dtype=dtype) + + elif arr.dtype.kind == "m": + from pandas.core.arrays import TimedeltaArray + + dtype = get_supported_dtype(arr.dtype) + return TimedeltaArray._from_sequence(arr, dtype=dtype) + + return arr + + +def sanitize_masked_array(data: ma.MaskedArray) -> np.ndarray: + """ + Convert numpy MaskedArray to ensure mask is softened. + """ + mask = ma.getmaskarray(data) + if mask.any(): + dtype, fill_value = maybe_promote(data.dtype, np.nan) + dtype = cast(np.dtype, dtype) + data = ma.asarray(data.astype(dtype, copy=True)) + data.soften_mask() # set hardmask False if it was True + data[mask] = fill_value + else: + data = data.copy() + return data + + +def sanitize_array( + data, + index: Index | None, + dtype: DtypeObj | None = None, + copy: bool = False, + *, + allow_2d: bool = False, +) -> ArrayLike: + """ + Sanitize input data to an ndarray or ExtensionArray, copy if specified, + coerce to the dtype if specified. + + Parameters + ---------- + data : Any + index : Index or None, default None + dtype : np.dtype, ExtensionDtype, or None, default None + copy : bool, default False + allow_2d : bool, default False + If False, raise if we have a 2D Arraylike. + + Returns + ------- + np.ndarray or ExtensionArray + """ + original_dtype = dtype + if isinstance(data, ma.MaskedArray): + data = sanitize_masked_array(data) + + if isinstance(dtype, NumpyEADtype): + # Avoid ending up with a NumpyExtensionArray + dtype = dtype.numpy_dtype + + object_index = False + if isinstance(data, ABCIndex) and data.dtype == object and dtype is None: + object_index = True + + # extract ndarray or ExtensionArray, ensure we have no NumpyExtensionArray + data = extract_array(data, extract_numpy=True, extract_range=True) + + if isinstance(data, np.ndarray) and data.ndim == 0: + if dtype is None: + dtype = data.dtype + data = lib.item_from_zerodim(data) + elif isinstance(data, range): + # GH#16804 + data = range_to_ndarray(data) + copy = False + + if not is_list_like(data): + if index is None: + raise ValueError("index must be specified when data is not list-like") + if ( + isinstance(data, str) + and using_pyarrow_string_dtype() + and original_dtype is None + ): + from pandas.core.arrays.string_ import StringDtype + + dtype = StringDtype("pyarrow_numpy") + data = construct_1d_arraylike_from_scalar(data, len(index), dtype) + + return data + + elif isinstance(data, ABCExtensionArray): + # it is already ensured above this is not a NumpyExtensionArray + # Until GH#49309 is fixed this check needs to come before the + # ExtensionDtype check + if dtype is not None: + subarr = data.astype(dtype, copy=copy) + elif copy: + subarr = data.copy() + else: + subarr = data + + elif isinstance(dtype, ExtensionDtype): + # create an extension array from its dtype + _sanitize_non_ordered(data) + cls = dtype.construct_array_type() + subarr = cls._from_sequence(data, dtype=dtype, copy=copy) + + # GH#846 + elif isinstance(data, np.ndarray): + if isinstance(data, np.matrix): + data = data.A + + if dtype is None: + subarr = data + if data.dtype == object: + subarr = maybe_infer_to_datetimelike(data) + if ( + object_index + and using_pyarrow_string_dtype() + and is_string_dtype(subarr) + ): + # Avoid inference when string option is set + subarr = data + elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): + from pandas.core.arrays.string_ import StringDtype + + dtype = StringDtype(storage="pyarrow_numpy") + subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) + + if subarr is data and copy: + subarr = subarr.copy() + + else: + # we will try to copy by-definition here + subarr = _try_cast(data, dtype, copy) + + elif hasattr(data, "__array__"): + # e.g. dask array GH#38645 + if not copy: + data = np.asarray(data) + else: + data = np.array(data, copy=copy) + return sanitize_array( + data, + index=index, + dtype=dtype, + copy=False, + allow_2d=allow_2d, + ) + + else: + _sanitize_non_ordered(data) + # materialize e.g. generators, convert e.g. tuples, abc.ValueView + data = list(data) + + if len(data) == 0 and dtype is None: + # We default to float64, matching numpy + subarr = np.array([], dtype=np.float64) + + elif dtype is not None: + subarr = _try_cast(data, dtype, copy) + + else: + subarr = maybe_convert_platform(data) + if subarr.dtype == object: + subarr = cast(np.ndarray, subarr) + subarr = maybe_infer_to_datetimelike(subarr) + + subarr = _sanitize_ndim(subarr, data, dtype, index, allow_2d=allow_2d) + + if isinstance(subarr, np.ndarray): + # at this point we should have dtype be None or subarr.dtype == dtype + dtype = cast(np.dtype, dtype) + subarr = _sanitize_str_dtypes(subarr, data, dtype, copy) + + return subarr + + +def range_to_ndarray(rng: range) -> np.ndarray: + """ + Cast a range object to ndarray. + """ + # GH#30171 perf avoid realizing range as a list in np.array + try: + arr = np.arange(rng.start, rng.stop, rng.step, dtype="int64") + except OverflowError: + # GH#30173 handling for ranges that overflow int64 + if (rng.start >= 0 and rng.step > 0) or (rng.step < 0 <= rng.stop): + try: + arr = np.arange(rng.start, rng.stop, rng.step, dtype="uint64") + except OverflowError: + arr = construct_1d_object_array_from_listlike(list(rng)) + else: + arr = construct_1d_object_array_from_listlike(list(rng)) + return arr + + +def _sanitize_non_ordered(data) -> None: + """ + Raise only for unordered sets, e.g., not for dict_keys + """ + if isinstance(data, (set, frozenset)): + raise TypeError(f"'{type(data).__name__}' type is unordered") + + +def _sanitize_ndim( + result: ArrayLike, + data, + dtype: DtypeObj | None, + index: Index | None, + *, + allow_2d: bool = False, +) -> ArrayLike: + """ + Ensure we have a 1-dimensional result array. + """ + if getattr(result, "ndim", 0) == 0: + raise ValueError("result should be arraylike with ndim > 0") + + if result.ndim == 1: + # the result that we want + result = _maybe_repeat(result, index) + + elif result.ndim > 1: + if isinstance(data, np.ndarray): + if allow_2d: + return result + raise ValueError( + f"Data must be 1-dimensional, got ndarray of shape {data.shape} instead" + ) + if is_object_dtype(dtype) and isinstance(dtype, ExtensionDtype): + # i.e. NumpyEADtype("O") + + result = com.asarray_tuplesafe(data, dtype=np.dtype("object")) + cls = dtype.construct_array_type() + result = cls._from_sequence(result, dtype=dtype) + else: + # error: Argument "dtype" to "asarray_tuplesafe" has incompatible type + # "Union[dtype[Any], ExtensionDtype, None]"; expected "Union[str, + # dtype[Any], None]" + result = com.asarray_tuplesafe(data, dtype=dtype) # type: ignore[arg-type] + return result + + +def _sanitize_str_dtypes( + result: np.ndarray, data, dtype: np.dtype | None, copy: bool +) -> np.ndarray: + """ + Ensure we have a dtype that is supported by pandas. + """ + + # This is to prevent mixed-type Series getting all casted to + # NumPy string type, e.g. NaN --> '-1#IND'. + if issubclass(result.dtype.type, str): + # GH#16605 + # If not empty convert the data to dtype + # GH#19853: If data is a scalar, result has already the result + if not lib.is_scalar(data): + if not np.all(isna(data)): + data = np.asarray(data, dtype=dtype) + if not copy: + result = np.asarray(data, dtype=object) + else: + result = np.array(data, dtype=object, copy=copy) + return result + + +def _maybe_repeat(arr: ArrayLike, index: Index | None) -> ArrayLike: + """ + If we have a length-1 array and an index describing how long we expect + the result to be, repeat the array. + """ + if index is not None: + if 1 == len(arr) != len(index): + arr = arr.repeat(len(index)) + return arr + + +def _try_cast( + arr: list | np.ndarray, + dtype: np.dtype, + copy: bool, +) -> ArrayLike: + """ + Convert input to numpy ndarray and optionally cast to a given dtype. + + Parameters + ---------- + arr : ndarray or list + Excludes: ExtensionArray, Series, Index. + dtype : np.dtype + copy : bool + If False, don't copy the data if not needed. + + Returns + ------- + np.ndarray or ExtensionArray + """ + is_ndarray = isinstance(arr, np.ndarray) + + if dtype == object: + if not is_ndarray: + subarr = construct_1d_object_array_from_listlike(arr) + return subarr + return ensure_wrapped_if_datetimelike(arr).astype(dtype, copy=copy) + + elif dtype.kind == "U": + # TODO: test cases with arr.dtype.kind in "mM" + if is_ndarray: + arr = cast(np.ndarray, arr) + shape = arr.shape + if arr.ndim > 1: + arr = arr.ravel() + else: + shape = (len(arr),) + return lib.ensure_string_array(arr, convert_na_value=False, copy=copy).reshape( + shape + ) + + elif dtype.kind in "mM": + return maybe_cast_to_datetime(arr, dtype) + + # GH#15832: Check if we are requesting a numeric dtype and + # that we can convert the data to the requested dtype. + elif dtype.kind in "iu": + # this will raise if we have e.g. floats + + subarr = maybe_cast_to_integer_array(arr, dtype) + elif not copy: + subarr = np.asarray(arr, dtype=dtype) + else: + subarr = np.array(arr, dtype=dtype, copy=copy) + + return subarr diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/frame.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/frame.py new file mode 100644 index 0000000000000000000000000000000000000000..afcd4d014316e34cd06db90fbea345a2ce0e9daa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/frame.py @@ -0,0 +1,12697 @@ +""" +DataFrame +--------- +An efficient 2D container for potentially mixed-type time series or other +labeled data series. + +Similar to its R counterpart, data.frame, except providing automatic data +alignment and a host of useful data manipulation methods having to do with the +labeling information +""" +from __future__ import annotations + +import collections +from collections import abc +from collections.abc import ( + Hashable, + Iterable, + Iterator, + Mapping, + Sequence, +) +import functools +from inspect import signature +from io import StringIO +import itertools +import operator +import sys +from textwrap import dedent +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Literal, + cast, + overload, +) +import warnings + +import numpy as np +from numpy import ma + +from pandas._config import ( + get_option, + using_copy_on_write, + warn_copy_on_write, +) +from pandas._config.config import _get_option + +from pandas._libs import ( + algos as libalgos, + lib, + properties, +) +from pandas._libs.hashtable import duplicated +from pandas._libs.lib import is_range_indexer +from pandas.compat import PYPY +from pandas.compat._constants import REF_COUNT +from pandas.compat._optional import import_optional_dependency +from pandas.compat.numpy import function as nv +from pandas.errors import ( + ChainedAssignmentError, + InvalidIndexError, + _chained_assignment_method_msg, + _chained_assignment_msg, + _chained_assignment_warning_method_msg, + _chained_assignment_warning_msg, +) +from pandas.util._decorators import ( + Appender, + Substitution, + deprecate_nonkeyword_arguments, + doc, +) +from pandas.util._exceptions import ( + find_stack_level, + rewrite_warning, +) +from pandas.util._validators import ( + validate_ascending, + validate_bool_kwarg, + validate_percentile, +) + +from pandas.core.dtypes.cast import ( + LossySetitemError, + can_hold_element, + construct_1d_arraylike_from_scalar, + construct_2d_arraylike_from_scalar, + find_common_type, + infer_dtype_from_scalar, + invalidate_string_dtypes, + maybe_box_native, + maybe_downcast_to_dtype, +) +from pandas.core.dtypes.common import ( + infer_dtype_from_object, + is_1d_only_ea_dtype, + is_array_like, + is_bool_dtype, + is_dataclass, + is_dict_like, + is_float, + is_float_dtype, + is_hashable, + is_integer, + is_integer_dtype, + is_iterator, + is_list_like, + is_scalar, + is_sequence, + needs_i8_conversion, + pandas_dtype, +) +from pandas.core.dtypes.concat import concat_compat +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + BaseMaskedDtype, + ExtensionDtype, +) +from pandas.core.dtypes.missing import ( + isna, + notna, +) + +from pandas.core import ( + algorithms, + common as com, + nanops, + ops, + roperator, +) +from pandas.core.accessor import CachedAccessor +from pandas.core.apply import reconstruct_and_relabel_result +from pandas.core.array_algos.take import take_2d_multi +from pandas.core.arraylike import OpsMixin +from pandas.core.arrays import ( + BaseMaskedArray, + DatetimeArray, + ExtensionArray, + PeriodArray, + TimedeltaArray, +) +from pandas.core.arrays.sparse import SparseFrameAccessor +from pandas.core.construction import ( + ensure_wrapped_if_datetimelike, + sanitize_array, + sanitize_masked_array, +) +from pandas.core.generic import ( + NDFrame, + make_doc, +) +from pandas.core.indexers import check_key_length +from pandas.core.indexes.api import ( + DatetimeIndex, + Index, + PeriodIndex, + default_index, + ensure_index, + ensure_index_from_sequences, +) +from pandas.core.indexes.multi import ( + MultiIndex, + maybe_droplevels, +) +from pandas.core.indexing import ( + check_bool_indexer, + check_dict_or_set_indexers, +) +from pandas.core.internals import ( + ArrayManager, + BlockManager, +) +from pandas.core.internals.construction import ( + arrays_to_mgr, + dataclasses_to_dicts, + dict_to_mgr, + mgr_to_mgr, + ndarray_to_mgr, + nested_data_to_arrays, + rec_array_to_mgr, + reorder_arrays, + to_arrays, + treat_as_nested, +) +from pandas.core.methods import selectn +from pandas.core.reshape.melt import melt +from pandas.core.series import Series +from pandas.core.shared_docs import _shared_docs +from pandas.core.sorting import ( + get_group_index, + lexsort_indexer, + nargsort, +) + +from pandas.io.common import get_handle +from pandas.io.formats import ( + console, + format as fmt, +) +from pandas.io.formats.info import ( + INFO_DOCSTRING, + DataFrameInfo, + frame_sub_kwargs, +) +import pandas.plotting + +if TYPE_CHECKING: + import datetime + + from pandas._libs.internals import BlockValuesRefs + from pandas._typing import ( + AggFuncType, + AnyAll, + AnyArrayLike, + ArrayLike, + Axes, + Axis, + AxisInt, + ColspaceArgType, + CompressionOptions, + CorrelationMethod, + DropKeep, + Dtype, + DtypeObj, + FilePath, + FloatFormatType, + FormattersType, + Frequency, + FromDictOrient, + IgnoreRaise, + IndexKeyFunc, + IndexLabel, + JoinValidate, + Level, + MergeHow, + MergeValidate, + MutableMappingT, + NaAction, + NaPosition, + NsmallestNlargestKeep, + PythonFuncType, + QuantileInterpolation, + ReadBuffer, + ReindexMethod, + Renamer, + Scalar, + Self, + SequenceNotStr, + SortKind, + StorageOptions, + Suffixes, + ToGbqIfexist, + ToStataByteorder, + ToTimestampHow, + UpdateJoin, + ValueKeyFunc, + WriteBuffer, + XMLParsers, + npt, + ) + + from pandas.core.groupby.generic import DataFrameGroupBy + from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg + from pandas.core.internals import SingleDataManager + + from pandas.io.formats.style import Styler + +# --------------------------------------------------------------------- +# Docstring templates + +_shared_doc_kwargs = { + "axes": "index, columns", + "klass": "DataFrame", + "axes_single_arg": "{0 or 'index', 1 or 'columns'}", + "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0 + If 0 or 'index': apply function to each column. + If 1 or 'columns': apply function to each row.""", + "inplace": """ + inplace : bool, default False + Whether to modify the DataFrame rather than creating a new one.""", + "optional_by": """ +by : str or list of str + Name or list of names to sort by. + + - if `axis` is 0 or `'index'` then `by` may contain index + levels and/or column labels. + - if `axis` is 1 or `'columns'` then `by` may contain column + levels and/or index labels.""", + "optional_reindex": """ +labels : array-like, optional + New labels / index to conform the axis specified by 'axis' to. +index : array-like, optional + New labels for the index. Preferably an Index object to avoid + duplicating data. +columns : array-like, optional + New labels for the columns. Preferably an Index object to avoid + duplicating data. +axis : int or str, optional + Axis to target. Can be either the axis name ('index', 'columns') + or number (0, 1).""", +} + +_merge_doc = """ +Merge DataFrame or named Series objects with a database-style join. + +A named Series object is treated as a DataFrame with a single named column. + +The join is done on columns or indexes. If joining columns on +columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes +on indexes or indexes on a column or columns, the index will be passed on. +When performing a cross merge, no column specifications to merge on are +allowed. + +.. warning:: + + If both key columns contain rows where the key is a null value, those + rows will be matched against each other. This is different from usual SQL + join behaviour and can lead to unexpected results. + +Parameters +----------%s +right : DataFrame or named Series + Object to merge with. +how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' + Type of merge to be performed. + + * left: use only keys from left frame, similar to a SQL left outer join; + preserve key order. + * right: use only keys from right frame, similar to a SQL right outer join; + preserve key order. + * outer: use union of keys from both frames, similar to a SQL full outer + join; sort keys lexicographically. + * inner: use intersection of keys from both frames, similar to a SQL inner + join; preserve the order of the left keys. + * cross: creates the cartesian product from both frames, preserves the order + of the left keys. +on : label or list + Column or index level names to join on. These must be found in both + DataFrames. If `on` is None and not merging on indexes then this defaults + to the intersection of the columns in both DataFrames. +left_on : label or list, or array-like + Column or index level names to join on in the left DataFrame. Can also + be an array or list of arrays of the length of the left DataFrame. + These arrays are treated as if they are columns. +right_on : label or list, or array-like + Column or index level names to join on in the right DataFrame. Can also + be an array or list of arrays of the length of the right DataFrame. + These arrays are treated as if they are columns. +left_index : bool, default False + Use the index from the left DataFrame as the join key(s). If it is a + MultiIndex, the number of keys in the other DataFrame (either the index + or a number of columns) must match the number of levels. +right_index : bool, default False + Use the index from the right DataFrame as the join key. Same caveats as + left_index. +sort : bool, default False + Sort the join keys lexicographically in the result DataFrame. If False, + the order of the join keys depends on the join type (how keyword). +suffixes : list-like, default is ("_x", "_y") + A length-2 sequence where each element is optionally a string + indicating the suffix to add to overlapping column names in + `left` and `right` respectively. Pass a value of `None` instead + of a string to indicate that the column name from `left` or + `right` should be left as-is, with no suffix. At least one of the + values must not be None. +copy : bool, default True + If False, avoid copy if possible. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` +indicator : bool or str, default False + If True, adds a column to the output DataFrame called "_merge" with + information on the source of each row. The column can be given a different + name by providing a string argument. The column will have a Categorical + type with the value of "left_only" for observations whose merge key only + appears in the left DataFrame, "right_only" for observations + whose merge key only appears in the right DataFrame, and "both" + if the observation's merge key is found in both DataFrames. + +validate : str, optional + If specified, checks if merge is of specified type. + + * "one_to_one" or "1:1": check if merge keys are unique in both + left and right datasets. + * "one_to_many" or "1:m": check if merge keys are unique in left + dataset. + * "many_to_one" or "m:1": check if merge keys are unique in right + dataset. + * "many_to_many" or "m:m": allowed, but does not result in checks. + +Returns +------- +DataFrame + A DataFrame of the two merged objects. + +See Also +-------- +merge_ordered : Merge with optional filling/interpolation. +merge_asof : Merge on nearest keys. +DataFrame.join : Similar method using indices. + +Examples +-------- +>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], +... 'value': [1, 2, 3, 5]}) +>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'], +... 'value': [5, 6, 7, 8]}) +>>> df1 + lkey value +0 foo 1 +1 bar 2 +2 baz 3 +3 foo 5 +>>> df2 + rkey value +0 foo 5 +1 bar 6 +2 baz 7 +3 foo 8 + +Merge df1 and df2 on the lkey and rkey columns. The value columns have +the default suffixes, _x and _y, appended. + +>>> df1.merge(df2, left_on='lkey', right_on='rkey') + lkey value_x rkey value_y +0 foo 1 foo 5 +1 foo 1 foo 8 +2 bar 2 bar 6 +3 baz 3 baz 7 +4 foo 5 foo 5 +5 foo 5 foo 8 + +Merge DataFrames df1 and df2 with specified left and right suffixes +appended to any overlapping columns. + +>>> df1.merge(df2, left_on='lkey', right_on='rkey', +... suffixes=('_left', '_right')) + lkey value_left rkey value_right +0 foo 1 foo 5 +1 foo 1 foo 8 +2 bar 2 bar 6 +3 baz 3 baz 7 +4 foo 5 foo 5 +5 foo 5 foo 8 + +Merge DataFrames df1 and df2, but raise an exception if the DataFrames have +any overlapping columns. + +>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False)) +Traceback (most recent call last): +... +ValueError: columns overlap but no suffix specified: + Index(['value'], dtype='object') + +>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) +>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) +>>> df1 + a b +0 foo 1 +1 bar 2 +>>> df2 + a c +0 foo 3 +1 baz 4 + +>>> df1.merge(df2, how='inner', on='a') + a b c +0 foo 1 3 + +>>> df1.merge(df2, how='left', on='a') + a b c +0 foo 1 3.0 +1 bar 2 NaN + +>>> df1 = pd.DataFrame({'left': ['foo', 'bar']}) +>>> df2 = pd.DataFrame({'right': [7, 8]}) +>>> df1 + left +0 foo +1 bar +>>> df2 + right +0 7 +1 8 + +>>> df1.merge(df2, how='cross') + left right +0 foo 7 +1 foo 8 +2 bar 7 +3 bar 8 +""" + + +# ----------------------------------------------------------------------- +# DataFrame class + + +class DataFrame(NDFrame, OpsMixin): + """ + Two-dimensional, size-mutable, potentially heterogeneous tabular data. + + Data structure also contains labeled axes (rows and columns). + Arithmetic operations align on both row and column labels. Can be + thought of as a dict-like container for Series objects. The primary + pandas data structure. + + Parameters + ---------- + data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame + Dict can contain Series, arrays, constants, dataclass or list-like objects. If + data is a dict, column order follows insertion-order. If a dict contains Series + which have an index defined, it is aligned by its index. This alignment also + occurs if data is a Series or a DataFrame itself. Alignment is done on + Series/DataFrame inputs. + + If data is a list of dicts, column order follows insertion-order. + + index : Index or array-like + Index to use for resulting frame. Will default to RangeIndex if + no indexing information part of input data and no index provided. + columns : Index or array-like + Column labels to use for resulting frame when data does not have them, + defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels, + will perform column selection instead. + dtype : dtype, default None + Data type to force. Only a single dtype is allowed. If None, infer. + copy : bool or None, default None + Copy data from inputs. + For dict data, the default of None behaves like ``copy=True``. For DataFrame + or 2d ndarray input, the default of None behaves like ``copy=False``. + If data is a dict containing one or more Series (possibly of different dtypes), + ``copy=False`` will ensure that these inputs are not copied. + + .. versionchanged:: 1.3.0 + + See Also + -------- + DataFrame.from_records : Constructor from tuples, also record arrays. + DataFrame.from_dict : From dicts of Series, arrays, or dicts. + read_csv : Read a comma-separated values (csv) file into DataFrame. + read_table : Read general delimited file into DataFrame. + read_clipboard : Read text from clipboard into DataFrame. + + Notes + ----- + Please reference the :ref:`User Guide ` for more information. + + Examples + -------- + Constructing DataFrame from a dictionary. + + >>> d = {'col1': [1, 2], 'col2': [3, 4]} + >>> df = pd.DataFrame(data=d) + >>> df + col1 col2 + 0 1 3 + 1 2 4 + + Notice that the inferred dtype is int64. + + >>> df.dtypes + col1 int64 + col2 int64 + dtype: object + + To enforce a single dtype: + + >>> df = pd.DataFrame(data=d, dtype=np.int8) + >>> df.dtypes + col1 int8 + col2 int8 + dtype: object + + Constructing DataFrame from a dictionary including Series: + + >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])} + >>> pd.DataFrame(data=d, index=[0, 1, 2, 3]) + col1 col2 + 0 0 NaN + 1 1 NaN + 2 2 2.0 + 3 3 3.0 + + Constructing DataFrame from numpy ndarray: + + >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), + ... columns=['a', 'b', 'c']) + >>> df2 + a b c + 0 1 2 3 + 1 4 5 6 + 2 7 8 9 + + Constructing DataFrame from a numpy ndarray that has labeled columns: + + >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)], + ... dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")]) + >>> df3 = pd.DataFrame(data, columns=['c', 'a']) + ... + >>> df3 + c a + 0 3 1 + 1 6 4 + 2 9 7 + + Constructing DataFrame from dataclass: + + >>> from dataclasses import make_dataclass + >>> Point = make_dataclass("Point", [("x", int), ("y", int)]) + >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)]) + x y + 0 0 0 + 1 0 3 + 2 2 3 + + Constructing DataFrame from Series/DataFrame: + + >>> ser = pd.Series([1, 2, 3], index=["a", "b", "c"]) + >>> df = pd.DataFrame(data=ser, index=["a", "c"]) + >>> df + 0 + a 1 + c 3 + + >>> df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"], columns=["x"]) + >>> df2 = pd.DataFrame(data=df1, index=["a", "c"]) + >>> df2 + x + a 1 + c 3 + """ + + _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set + _typ = "dataframe" + _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray) + _accessors: set[str] = {"sparse"} + _hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([]) + _mgr: BlockManager | ArrayManager + + # similar to __array_priority__, positions DataFrame before Series, Index, + # and ExtensionArray. Should NOT be overridden by subclasses. + __pandas_priority__ = 4000 + + @property + def _constructor(self) -> Callable[..., DataFrame]: + return DataFrame + + def _constructor_from_mgr(self, mgr, axes) -> DataFrame: + df = DataFrame._from_mgr(mgr, axes=axes) + + if type(self) is DataFrame: + # This would also work `if self._constructor is DataFrame`, but + # this check is slightly faster, benefiting the most-common case. + return df + + elif type(self).__name__ == "GeoDataFrame": + # Shim until geopandas can override their _constructor_from_mgr + # bc they have different behavior for Managers than for DataFrames + return self._constructor(mgr) + + # We assume that the subclass __init__ knows how to handle a + # pd.DataFrame object. + return self._constructor(df) + + _constructor_sliced: Callable[..., Series] = Series + + def _constructor_sliced_from_mgr(self, mgr, axes) -> Series: + ser = Series._from_mgr(mgr, axes) + ser._name = None # caller is responsible for setting real name + + if type(self) is DataFrame: + # This would also work `if self._constructor_sliced is Series`, but + # this check is slightly faster, benefiting the most-common case. + return ser + + # We assume that the subclass __init__ knows how to handle a + # pd.Series object. + return self._constructor_sliced(ser) + + # ---------------------------------------------------------------------- + # Constructors + + def __init__( + self, + data=None, + index: Axes | None = None, + columns: Axes | None = None, + dtype: Dtype | None = None, + copy: bool | None = None, + ) -> None: + allow_mgr = False + if dtype is not None: + dtype = self._validate_dtype(dtype) + + if isinstance(data, DataFrame): + data = data._mgr + allow_mgr = True + if not copy: + # if not copying data, ensure to still return a shallow copy + # to avoid the result sharing the same Manager + data = data.copy(deep=False) + + if isinstance(data, (BlockManager, ArrayManager)): + if not allow_mgr: + # GH#52419 + warnings.warn( + f"Passing a {type(data).__name__} to {type(self).__name__} " + "is deprecated and will raise in a future version. " + "Use public APIs instead.", + DeprecationWarning, + stacklevel=1, # bump to 2 once pyarrow 15.0 is released with fix + ) + + if using_copy_on_write(): + data = data.copy(deep=False) + # first check if a Manager is passed without any other arguments + # -> use fastpath (without checking Manager type) + if index is None and columns is None and dtype is None and not copy: + # GH#33357 fastpath + NDFrame.__init__(self, data) + return + + manager = _get_option("mode.data_manager", silent=True) + + is_pandas_object = isinstance(data, (Series, Index, ExtensionArray)) + data_dtype = getattr(data, "dtype", None) + original_dtype = dtype + + # GH47215 + if isinstance(index, set): + raise ValueError("index cannot be a set") + if isinstance(columns, set): + raise ValueError("columns cannot be a set") + + if copy is None: + if isinstance(data, dict): + # retain pre-GH#38939 default behavior + copy = True + elif ( + manager == "array" + and isinstance(data, (np.ndarray, ExtensionArray)) + and data.ndim == 2 + ): + # INFO(ArrayManager) by default copy the 2D input array to get + # contiguous 1D arrays + copy = True + elif using_copy_on_write() and not isinstance( + data, (Index, DataFrame, Series) + ): + copy = True + else: + copy = False + + if data is None: + index = index if index is not None else default_index(0) + columns = columns if columns is not None else default_index(0) + dtype = dtype if dtype is not None else pandas_dtype(object) + data = [] + + if isinstance(data, (BlockManager, ArrayManager)): + mgr = self._init_mgr( + data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy + ) + + elif isinstance(data, dict): + # GH#38939 de facto copy defaults to False only in non-dict cases + mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager) + elif isinstance(data, ma.MaskedArray): + from numpy.ma import mrecords + + # masked recarray + if isinstance(data, mrecords.MaskedRecords): + raise TypeError( + "MaskedRecords are not supported. Pass " + "{name: data[name] for name in data.dtype.names} " + "instead" + ) + + # a masked array + data = sanitize_masked_array(data) + mgr = ndarray_to_mgr( + data, + index, + columns, + dtype=dtype, + copy=copy, + typ=manager, + ) + + elif isinstance(data, (np.ndarray, Series, Index, ExtensionArray)): + if data.dtype.names: + # i.e. numpy structured array + data = cast(np.ndarray, data) + mgr = rec_array_to_mgr( + data, + index, + columns, + dtype, + copy, + typ=manager, + ) + elif getattr(data, "name", None) is not None: + # i.e. Series/Index with non-None name + _copy = copy if using_copy_on_write() else True + mgr = dict_to_mgr( + # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no + # attribute "name" + {data.name: data}, # type: ignore[union-attr] + index, + columns, + dtype=dtype, + typ=manager, + copy=_copy, + ) + else: + mgr = ndarray_to_mgr( + data, + index, + columns, + dtype=dtype, + copy=copy, + typ=manager, + ) + + # For data is list-like, or Iterable (will consume into list) + elif is_list_like(data): + if not isinstance(data, abc.Sequence): + if hasattr(data, "__array__"): + # GH#44616 big perf improvement for e.g. pytorch tensor + data = np.asarray(data) + else: + data = list(data) + if len(data) > 0: + if is_dataclass(data[0]): + data = dataclasses_to_dicts(data) + if not isinstance(data, np.ndarray) and treat_as_nested(data): + # exclude ndarray as we may have cast it a few lines above + if columns is not None: + columns = ensure_index(columns) + arrays, columns, index = nested_data_to_arrays( + # error: Argument 3 to "nested_data_to_arrays" has incompatible + # type "Optional[Collection[Any]]"; expected "Optional[Index]" + data, + columns, + index, # type: ignore[arg-type] + dtype, + ) + mgr = arrays_to_mgr( + arrays, + columns, + index, + dtype=dtype, + typ=manager, + ) + else: + mgr = ndarray_to_mgr( + data, + index, + columns, + dtype=dtype, + copy=copy, + typ=manager, + ) + else: + mgr = dict_to_mgr( + {}, + index, + columns if columns is not None else default_index(0), + dtype=dtype, + typ=manager, + ) + # For data is scalar + else: + if index is None or columns is None: + raise ValueError("DataFrame constructor not properly called!") + + index = ensure_index(index) + columns = ensure_index(columns) + + if not dtype: + dtype, _ = infer_dtype_from_scalar(data) + + # For data is a scalar extension dtype + if isinstance(dtype, ExtensionDtype): + # TODO(EA2D): special case not needed with 2D EAs + + values = [ + construct_1d_arraylike_from_scalar(data, len(index), dtype) + for _ in range(len(columns)) + ] + mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager) + else: + arr2d = construct_2d_arraylike_from_scalar( + data, + len(index), + len(columns), + dtype, + copy, + ) + + mgr = ndarray_to_mgr( + arr2d, + index, + columns, + dtype=arr2d.dtype, + copy=False, + typ=manager, + ) + + # ensure correct Manager type according to settings + mgr = mgr_to_mgr(mgr, typ=manager) + + NDFrame.__init__(self, mgr) + + if original_dtype is None and is_pandas_object and data_dtype == np.object_: + if self.dtypes.iloc[0] != data_dtype: + warnings.warn( + "Dtype inference on a pandas object " + "(Series, Index, ExtensionArray) is deprecated. The DataFrame " + "constructor will keep the original dtype in the future. " + "Call `infer_objects` on the result to get the old " + "behavior.", + FutureWarning, + stacklevel=2, + ) + + # ---------------------------------------------------------------------- + + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> DataFrameXchg: + """ + Return the dataframe interchange object implementing the interchange protocol. + + Parameters + ---------- + nan_as_null : bool, default False + `nan_as_null` is DEPRECATED and has no effect. Please avoid using + it; it will be removed in a future release. + allow_copy : bool, default True + Whether to allow memory copying when exporting. If set to False + it would cause non-zero-copy exports to fail. + + Returns + ------- + DataFrame interchange object + The object which consuming library can use to ingress the dataframe. + + Notes + ----- + Details on the interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + >>> df_not_necessarily_pandas = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + >>> interchange_object = df_not_necessarily_pandas.__dataframe__() + >>> interchange_object.column_names() + Index(['A', 'B'], dtype='object') + >>> df_pandas = (pd.api.interchange.from_dataframe + ... (interchange_object.select_columns_by_name(['A']))) + >>> df_pandas + A + 0 1 + 1 2 + + These methods (``column_names``, ``select_columns_by_name``) should work + for any dataframe library which implements the interchange protocol. + """ + + from pandas.core.interchange.dataframe import PandasDataFrameXchg + + return PandasDataFrameXchg(self, allow_copy=allow_copy) + + def __dataframe_consortium_standard__( + self, *, api_version: str | None = None + ) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of pandas. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + dataframe_api_compat = import_optional_dependency("dataframe_api_compat") + convert_to_standard_compliant_dataframe = ( + dataframe_api_compat.pandas_standard.convert_to_standard_compliant_dataframe + ) + return convert_to_standard_compliant_dataframe(self, api_version=api_version) + + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the pandas DataFrame as an Arrow C stream PyCapsule. + + This relies on pyarrow to convert the pandas DataFrame to the Arrow + format (and follows the default behaviour of ``pyarrow.Table.from_pandas`` + in its handling of the index, i.e. store the index as a column except + for RangeIndex). + This conversion is not necessarily zero-copy. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the dataframe should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + + Returns + ------- + PyCapsule + """ + pa = import_optional_dependency("pyarrow", min_version="14.0.0") + if requested_schema is not None: + requested_schema = pa.Schema._import_from_c_capsule(requested_schema) + table = pa.Table.from_pandas(self, schema=requested_schema) + return table.__arrow_c_stream__() + + # ---------------------------------------------------------------------- + + @property + def axes(self) -> list[Index]: + """ + Return a list representing the axes of the DataFrame. + + It has the row axis labels and column axis labels as the only members. + They are returned in that order. + + Examples + -------- + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df.axes + [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'], + dtype='object')] + """ + return [self.index, self.columns] + + @property + def shape(self) -> tuple[int, int]: + """ + Return a tuple representing the dimensionality of the DataFrame. + + See Also + -------- + ndarray.shape : Tuple of array dimensions. + + Examples + -------- + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df.shape + (2, 2) + + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4], + ... 'col3': [5, 6]}) + >>> df.shape + (2, 3) + """ + return len(self.index), len(self.columns) + + @property + def _is_homogeneous_type(self) -> bool: + """ + Whether all the columns in a DataFrame have the same type. + + Returns + ------- + bool + + Examples + -------- + >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type + True + >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type + False + + Items with the same type but different sizes are considered + different types. + + >>> DataFrame({ + ... "A": np.array([1, 2], dtype=np.int32), + ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type + False + """ + # The "<" part of "<=" here is for empty DataFrame cases + return len({arr.dtype for arr in self._mgr.arrays}) <= 1 + + @property + def _can_fast_transpose(self) -> bool: + """ + Can we transpose this DataFrame without creating any new array objects. + """ + if isinstance(self._mgr, ArrayManager): + return False + blocks = self._mgr.blocks + if len(blocks) != 1: + return False + + dtype = blocks[0].dtype + # TODO(EA2D) special case would be unnecessary with 2D EAs + return not is_1d_only_ea_dtype(dtype) + + @property + def _values(self) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray: + """ + Analogue to ._values that may return a 2D ExtensionArray. + """ + mgr = self._mgr + + if isinstance(mgr, ArrayManager): + if len(mgr.arrays) == 1 and not is_1d_only_ea_dtype(mgr.arrays[0].dtype): + # error: Item "ExtensionArray" of "Union[ndarray, ExtensionArray]" + # has no attribute "reshape" + return mgr.arrays[0].reshape(-1, 1) # type: ignore[union-attr] + return ensure_wrapped_if_datetimelike(self.values) + + blocks = mgr.blocks + if len(blocks) != 1: + return ensure_wrapped_if_datetimelike(self.values) + + arr = blocks[0].values + if arr.ndim == 1: + # non-2D ExtensionArray + return self.values + + # more generally, whatever we allow in NDArrayBackedExtensionBlock + arr = cast("np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray", arr) + return arr.T + + # ---------------------------------------------------------------------- + # Rendering Methods + + def _repr_fits_vertical_(self) -> bool: + """ + Check length against max_rows. + """ + max_rows = get_option("display.max_rows") + return len(self) <= max_rows + + def _repr_fits_horizontal_(self) -> bool: + """ + Check if full repr fits in horizontal boundaries imposed by the display + options width and max_columns. + """ + width, height = console.get_console_size() + max_columns = get_option("display.max_columns") + nb_columns = len(self.columns) + + # exceed max columns + if (max_columns and nb_columns > max_columns) or ( + width and nb_columns > (width // 2) + ): + return False + + # used by repr_html under IPython notebook or scripts ignore terminal + # dims + if width is None or not console.in_interactive_session(): + return True + + if get_option("display.width") is not None or console.in_ipython_frontend(): + # check at least the column row for excessive width + max_rows = 1 + else: + max_rows = get_option("display.max_rows") + + # when auto-detecting, so width=None and not in ipython front end + # check whether repr fits horizontal by actually checking + # the width of the rendered repr + buf = StringIO() + + # only care about the stuff we'll actually print out + # and to_string on entire frame may be expensive + d = self + + if max_rows is not None: # unlimited rows + # min of two, where one may be None + d = d.iloc[: min(max_rows, len(d))] + else: + return True + + d.to_string(buf=buf) + value = buf.getvalue() + repr_width = max(len(line) for line in value.split("\n")) + + return repr_width < width + + def _info_repr(self) -> bool: + """ + True if the repr should show the info view. + """ + info_repr_option = get_option("display.large_repr") == "info" + return info_repr_option and not ( + self._repr_fits_horizontal_() and self._repr_fits_vertical_() + ) + + def __repr__(self) -> str: + """ + Return a string representation for a particular DataFrame. + """ + if self._info_repr(): + buf = StringIO() + self.info(buf=buf) + return buf.getvalue() + + repr_params = fmt.get_dataframe_repr_params() + return self.to_string(**repr_params) + + def _repr_html_(self) -> str | None: + """ + Return a html representation for a particular DataFrame. + + Mainly for IPython notebook. + """ + if self._info_repr(): + buf = StringIO() + self.info(buf=buf) + # need to escape the , should be the first line. + val = buf.getvalue().replace("<", r"<", 1) + val = val.replace(">", r">", 1) + return f"
{val}
" + + if get_option("display.notebook_repr_html"): + max_rows = get_option("display.max_rows") + min_rows = get_option("display.min_rows") + max_cols = get_option("display.max_columns") + show_dimensions = get_option("display.show_dimensions") + + formatter = fmt.DataFrameFormatter( + self, + columns=None, + col_space=None, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + justify=None, + index_names=True, + header=True, + index=True, + bold_rows=True, + escape=True, + max_rows=max_rows, + min_rows=min_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=".", + ) + return fmt.DataFrameRenderer(formatter).to_html(notebook=True) + else: + return None + + @overload + def to_string( + self, + buf: None = ..., + columns: Axes | None = ..., + col_space: int | list[int] | dict[Hashable, int] | None = ..., + header: bool | SequenceNotStr[str] = ..., + index: bool = ..., + na_rep: str = ..., + formatters: fmt.FormattersType | None = ..., + float_format: fmt.FloatFormatType | None = ..., + sparsify: bool | None = ..., + index_names: bool = ..., + justify: str | None = ..., + max_rows: int | None = ..., + max_cols: int | None = ..., + show_dimensions: bool = ..., + decimal: str = ..., + line_width: int | None = ..., + min_rows: int | None = ..., + max_colwidth: int | None = ..., + encoding: str | None = ..., + ) -> str: + ... + + @overload + def to_string( + self, + buf: FilePath | WriteBuffer[str], + columns: Axes | None = ..., + col_space: int | list[int] | dict[Hashable, int] | None = ..., + header: bool | SequenceNotStr[str] = ..., + index: bool = ..., + na_rep: str = ..., + formatters: fmt.FormattersType | None = ..., + float_format: fmt.FloatFormatType | None = ..., + sparsify: bool | None = ..., + index_names: bool = ..., + justify: str | None = ..., + max_rows: int | None = ..., + max_cols: int | None = ..., + show_dimensions: bool = ..., + decimal: str = ..., + line_width: int | None = ..., + min_rows: int | None = ..., + max_colwidth: int | None = ..., + encoding: str | None = ..., + ) -> None: + ... + + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "buf"], name="to_string" + ) + @Substitution( + header_type="bool or list of str", + header="Write out the column names. If a list of columns " + "is given, it is assumed to be aliases for the " + "column names", + col_space_type="int, list or dict of int", + col_space="The minimum width of each column. If a list of ints is given " + "every integers corresponds with one column. If a dict is given, the key " + "references the column, while the value defines the space to use.", + ) + @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) + def to_string( + self, + buf: FilePath | WriteBuffer[str] | None = None, + columns: Axes | None = None, + col_space: int | list[int] | dict[Hashable, int] | None = None, + header: bool | SequenceNotStr[str] = True, + index: bool = True, + na_rep: str = "NaN", + formatters: fmt.FormattersType | None = None, + float_format: fmt.FloatFormatType | None = None, + sparsify: bool | None = None, + index_names: bool = True, + justify: str | None = None, + max_rows: int | None = None, + max_cols: int | None = None, + show_dimensions: bool = False, + decimal: str = ".", + line_width: int | None = None, + min_rows: int | None = None, + max_colwidth: int | None = None, + encoding: str | None = None, + ) -> str | None: + """ + Render a DataFrame to a console-friendly tabular output. + %(shared_params)s + line_width : int, optional + Width to wrap a line in characters. + min_rows : int, optional + The number of rows to display in the console in a truncated repr + (when number of rows is above `max_rows`). + max_colwidth : int, optional + Max width to truncate each column in characters. By default, no limit. + encoding : str, default "utf-8" + Set character encoding. + %(returns)s + See Also + -------- + to_html : Convert DataFrame to HTML. + + Examples + -------- + >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]} + >>> df = pd.DataFrame(d) + >>> print(df.to_string()) + col1 col2 + 0 1 4 + 1 2 5 + 2 3 6 + """ + from pandas import option_context + + with option_context("display.max_colwidth", max_colwidth): + formatter = fmt.DataFrameFormatter( + self, + columns=columns, + col_space=col_space, + na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, + justify=justify, + index_names=index_names, + header=header, + index=index, + min_rows=min_rows, + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=decimal, + ) + return fmt.DataFrameRenderer(formatter).to_string( + buf=buf, + encoding=encoding, + line_width=line_width, + ) + + def _get_values_for_csv( + self, + *, + float_format: FloatFormatType | None, + date_format: str | None, + decimal: str, + na_rep: str, + quoting, # int csv.QUOTE_FOO from stdlib + ) -> Self: + # helper used by to_csv + mgr = self._mgr.get_values_for_csv( + float_format=float_format, + date_format=date_format, + decimal=decimal, + na_rep=na_rep, + quoting=quoting, + ) + # error: Incompatible return value type (got "DataFrame", expected "Self") + return self._constructor_from_mgr(mgr, axes=mgr.axes) # type: ignore[return-value] + + # ---------------------------------------------------------------------- + + @property + def style(self) -> Styler: + """ + Returns a Styler object. + + Contains methods for building a styled HTML representation of the DataFrame. + + See Also + -------- + io.formats.style.Styler : Helps style a DataFrame or Series according to the + data with HTML and CSS. + + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 2, 3]}) + >>> df.style # doctest: +SKIP + + Please see + `Table Visualization <../../user_guide/style.ipynb>`_ for more examples. + """ + from pandas.io.formats.style import Styler + + return Styler(self) + + _shared_docs[ + "items" + ] = r""" + Iterate over (column name, Series) pairs. + + Iterates over the DataFrame columns, returning a tuple with + the column name and the content as a Series. + + Yields + ------ + label : object + The column names for the DataFrame being iterated over. + content : Series + The column entries belonging to each label, as a Series. + + See Also + -------- + DataFrame.iterrows : Iterate over DataFrame rows as + (index, Series) pairs. + DataFrame.itertuples : Iterate over DataFrame rows as namedtuples + of the values. + + Examples + -------- + >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'], + ... 'population': [1864, 22000, 80000]}, + ... index=['panda', 'polar', 'koala']) + >>> df + species population + panda bear 1864 + polar bear 22000 + koala marsupial 80000 + >>> for label, content in df.items(): + ... print(f'label: {label}') + ... print(f'content: {content}', sep='\n') + ... + label: species + content: + panda bear + polar bear + koala marsupial + Name: species, dtype: object + label: population + content: + panda 1864 + polar 22000 + koala 80000 + Name: population, dtype: int64 + """ + + @Appender(_shared_docs["items"]) + def items(self) -> Iterable[tuple[Hashable, Series]]: + if self.columns.is_unique and hasattr(self, "_item_cache"): + for k in self.columns: + yield k, self._get_item_cache(k) + else: + for i, k in enumerate(self.columns): + yield k, self._ixs(i, axis=1) + + def iterrows(self) -> Iterable[tuple[Hashable, Series]]: + """ + Iterate over DataFrame rows as (index, Series) pairs. + + Yields + ------ + index : label or tuple of label + The index of the row. A tuple for a `MultiIndex`. + data : Series + The data of the row as a Series. + + See Also + -------- + DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values. + DataFrame.items : Iterate over (column name, Series) pairs. + + Notes + ----- + 1. Because ``iterrows`` returns a Series for each row, + it does **not** preserve dtypes across the rows (dtypes are + preserved across columns for DataFrames). + + To preserve dtypes while iterating over the rows, it is better + to use :meth:`itertuples` which returns namedtuples of the values + and which is generally faster than ``iterrows``. + + 2. You should **never modify** something you are iterating over. + This is not guaranteed to work in all cases. Depending on the + data types, the iterator returns a copy and not a view, and writing + to it will have no effect. + + Examples + -------- + + >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float']) + >>> row = next(df.iterrows())[1] + >>> row + int 1.0 + float 1.5 + Name: 0, dtype: float64 + >>> print(row['int'].dtype) + float64 + >>> print(df['int'].dtype) + int64 + """ + columns = self.columns + klass = self._constructor_sliced + using_cow = using_copy_on_write() + for k, v in zip(self.index, self.values): + s = klass(v, index=columns, name=k).__finalize__(self) + if using_cow and self._mgr.is_single_block: + s._mgr.add_references(self._mgr) # type: ignore[arg-type] + yield k, s + + def itertuples( + self, index: bool = True, name: str | None = "Pandas" + ) -> Iterable[tuple[Any, ...]]: + """ + Iterate over DataFrame rows as namedtuples. + + Parameters + ---------- + index : bool, default True + If True, return the index as the first element of the tuple. + name : str or None, default "Pandas" + The name of the returned namedtuples or None to return regular + tuples. + + Returns + ------- + iterator + An object to iterate over namedtuples for each row in the + DataFrame with the first field possibly being the index and + following fields being the column values. + + See Also + -------- + DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) + pairs. + DataFrame.items : Iterate over (column name, Series) pairs. + + Notes + ----- + The column names will be renamed to positional names if they are + invalid Python identifiers, repeated, or start with an underscore. + + Examples + -------- + >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]}, + ... index=['dog', 'hawk']) + >>> df + num_legs num_wings + dog 4 0 + hawk 2 2 + >>> for row in df.itertuples(): + ... print(row) + ... + Pandas(Index='dog', num_legs=4, num_wings=0) + Pandas(Index='hawk', num_legs=2, num_wings=2) + + By setting the `index` parameter to False we can remove the index + as the first element of the tuple: + + >>> for row in df.itertuples(index=False): + ... print(row) + ... + Pandas(num_legs=4, num_wings=0) + Pandas(num_legs=2, num_wings=2) + + With the `name` parameter set we set a custom name for the yielded + namedtuples: + + >>> for row in df.itertuples(name='Animal'): + ... print(row) + ... + Animal(Index='dog', num_legs=4, num_wings=0) + Animal(Index='hawk', num_legs=2, num_wings=2) + """ + arrays = [] + fields = list(self.columns) + if index: + arrays.append(self.index) + fields.insert(0, "Index") + + # use integer indexing because of possible duplicate column names + arrays.extend(self.iloc[:, k] for k in range(len(self.columns))) + + if name is not None: + # https://github.com/python/mypy/issues/9046 + # error: namedtuple() expects a string literal as the first argument + itertuple = collections.namedtuple( # type: ignore[misc] + name, fields, rename=True + ) + return map(itertuple._make, zip(*arrays)) + + # fallback to regular tuples + return zip(*arrays) + + def __len__(self) -> int: + """ + Returns length of info axis, but here we use the index. + """ + return len(self.index) + + @overload + def dot(self, other: Series) -> Series: + ... + + @overload + def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame: + ... + + def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: + """ + Compute the matrix multiplication between the DataFrame and other. + + This method computes the matrix product between the DataFrame and the + values of an other Series, DataFrame or a numpy array. + + It can also be called using ``self @ other``. + + Parameters + ---------- + other : Series, DataFrame or array-like + The other object to compute the matrix product with. + + Returns + ------- + Series or DataFrame + If other is a Series, return the matrix product between self and + other as a Series. If other is a DataFrame or a numpy.array, return + the matrix product of self and other in a DataFrame of a np.array. + + See Also + -------- + Series.dot: Similar method for Series. + + Notes + ----- + The dimensions of DataFrame and other must be compatible in order to + compute the matrix multiplication. In addition, the column names of + DataFrame and the index of other must contain the same values, as they + will be aligned prior to the multiplication. + + The dot method for Series computes the inner product, instead of the + matrix product here. + + Examples + -------- + Here we multiply a DataFrame with a Series. + + >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) + >>> s = pd.Series([1, 1, 2, 1]) + >>> df.dot(s) + 0 -4 + 1 5 + dtype: int64 + + Here we multiply a DataFrame with another DataFrame. + + >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]]) + >>> df.dot(other) + 0 1 + 0 1 4 + 1 2 2 + + Note that the dot method give the same result as @ + + >>> df @ other + 0 1 + 0 1 4 + 1 2 2 + + The dot method works also if other is an np.array. + + >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]]) + >>> df.dot(arr) + 0 1 + 0 1 4 + 1 2 2 + + Note how shuffling of the objects does not change the result. + + >>> s2 = s.reindex([1, 0, 2, 3]) + >>> df.dot(s2) + 0 -4 + 1 5 + dtype: int64 + """ + if isinstance(other, (Series, DataFrame)): + common = self.columns.union(other.index) + if len(common) > len(self.columns) or len(common) > len(other.index): + raise ValueError("matrices are not aligned") + + left = self.reindex(columns=common, copy=False) + right = other.reindex(index=common, copy=False) + lvals = left.values + rvals = right._values + else: + left = self + lvals = self.values + rvals = np.asarray(other) + if lvals.shape[1] != rvals.shape[0]: + raise ValueError( + f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}" + ) + + if isinstance(other, DataFrame): + common_type = find_common_type(list(self.dtypes) + list(other.dtypes)) + return self._constructor( + np.dot(lvals, rvals), + index=left.index, + columns=other.columns, + copy=False, + dtype=common_type, + ) + elif isinstance(other, Series): + common_type = find_common_type(list(self.dtypes) + [other.dtypes]) + return self._constructor_sliced( + np.dot(lvals, rvals), index=left.index, copy=False, dtype=common_type + ) + elif isinstance(rvals, (np.ndarray, Index)): + result = np.dot(lvals, rvals) + if result.ndim == 2: + return self._constructor(result, index=left.index, copy=False) + else: + return self._constructor_sliced(result, index=left.index, copy=False) + else: # pragma: no cover + raise TypeError(f"unsupported type: {type(other)}") + + @overload + def __matmul__(self, other: Series) -> Series: + ... + + @overload + def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: + ... + + def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: + """ + Matrix multiplication using binary `@` operator. + """ + return self.dot(other) + + def __rmatmul__(self, other) -> DataFrame: + """ + Matrix multiplication using binary `@` operator. + """ + try: + return self.T.dot(np.transpose(other)).T + except ValueError as err: + if "shape mismatch" not in str(err): + raise + # GH#21581 give exception message for original shapes + msg = f"shapes {np.shape(other)} and {self.shape} not aligned" + raise ValueError(msg) from err + + # ---------------------------------------------------------------------- + # IO methods (to / from other formats) + + @classmethod + def from_dict( + cls, + data: dict, + orient: FromDictOrient = "columns", + dtype: Dtype | None = None, + columns: Axes | None = None, + ) -> DataFrame: + """ + Construct DataFrame from dict of array-like or dicts. + + Creates DataFrame object from dictionary by columns or by index + allowing dtype specification. + + Parameters + ---------- + data : dict + Of the form {field : array-like} or {field : dict}. + orient : {'columns', 'index', 'tight'}, default 'columns' + The "orientation" of the data. If the keys of the passed dict + should be the columns of the resulting DataFrame, pass 'columns' + (default). Otherwise if the keys should be rows, pass 'index'. + If 'tight', assume a dict with keys ['index', 'columns', 'data', + 'index_names', 'column_names']. + + .. versionadded:: 1.4.0 + 'tight' as an allowed value for the ``orient`` argument + + dtype : dtype, default None + Data type to force after DataFrame construction, otherwise infer. + columns : list, default None + Column labels to use when ``orient='index'``. Raises a ValueError + if used with ``orient='columns'`` or ``orient='tight'``. + + Returns + ------- + DataFrame + + See Also + -------- + DataFrame.from_records : DataFrame from structured ndarray, sequence + of tuples or dicts, or DataFrame. + DataFrame : DataFrame object creation using constructor. + DataFrame.to_dict : Convert the DataFrame to a dictionary. + + Examples + -------- + By default the keys of the dict become the DataFrame columns: + + >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']} + >>> pd.DataFrame.from_dict(data) + col_1 col_2 + 0 3 a + 1 2 b + 2 1 c + 3 0 d + + Specify ``orient='index'`` to create the DataFrame using dictionary + keys as rows: + + >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']} + >>> pd.DataFrame.from_dict(data, orient='index') + 0 1 2 3 + row_1 3 2 1 0 + row_2 a b c d + + When using the 'index' orientation, the column names can be + specified manually: + + >>> pd.DataFrame.from_dict(data, orient='index', + ... columns=['A', 'B', 'C', 'D']) + A B C D + row_1 3 2 1 0 + row_2 a b c d + + Specify ``orient='tight'`` to create the DataFrame using a 'tight' + format: + + >>> data = {'index': [('a', 'b'), ('a', 'c')], + ... 'columns': [('x', 1), ('y', 2)], + ... 'data': [[1, 3], [2, 4]], + ... 'index_names': ['n1', 'n2'], + ... 'column_names': ['z1', 'z2']} + >>> pd.DataFrame.from_dict(data, orient='tight') + z1 x y + z2 1 2 + n1 n2 + a b 1 3 + c 2 4 + """ + index = None + orient = orient.lower() # type: ignore[assignment] + if orient == "index": + if len(data) > 0: + # TODO speed up Series case + if isinstance(next(iter(data.values())), (Series, dict)): + data = _from_nested_dict(data) + else: + index = list(data.keys()) + # error: Incompatible types in assignment (expression has type + # "List[Any]", variable has type "Dict[Any, Any]") + data = list(data.values()) # type: ignore[assignment] + elif orient in ("columns", "tight"): + if columns is not None: + raise ValueError(f"cannot use columns parameter with orient='{orient}'") + else: # pragma: no cover + raise ValueError( + f"Expected 'index', 'columns' or 'tight' for orient parameter. " + f"Got '{orient}' instead" + ) + + if orient != "tight": + return cls(data, index=index, columns=columns, dtype=dtype) + else: + realdata = data["data"] + + def create_index(indexlist, namelist): + index: Index + if len(namelist) > 1: + index = MultiIndex.from_tuples(indexlist, names=namelist) + else: + index = Index(indexlist, name=namelist[0]) + return index + + index = create_index(data["index"], data["index_names"]) + columns = create_index(data["columns"], data["column_names"]) + return cls(realdata, index=index, columns=columns, dtype=dtype) + + def to_numpy( + self, + dtype: npt.DTypeLike | None = None, + copy: bool = False, + na_value: object = lib.no_default, + ) -> np.ndarray: + """ + Convert the DataFrame to a NumPy array. + + By default, the dtype of the returned array will be the common NumPy + dtype of all types in the DataFrame. For example, if the dtypes are + ``float16`` and ``float32``, the results dtype will be ``float32``. + This may require copying data and coercing values, which may be + expensive. + + Parameters + ---------- + dtype : str or numpy.dtype, optional + The dtype to pass to :meth:`numpy.asarray`. + copy : bool, default False + Whether to ensure that the returned value is not a view on + another array. Note that ``copy=False`` does not *ensure* that + ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that + a copy is made, even if not strictly necessary. + na_value : Any, optional + The value to use for missing values. The default value depends + on `dtype` and the dtypes of the DataFrame columns. + + Returns + ------- + numpy.ndarray + + See Also + -------- + Series.to_numpy : Similar method for Series. + + Examples + -------- + >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy() + array([[1, 3], + [2, 4]]) + + With heterogeneous data, the lowest common type will have to + be used. + + >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]}) + >>> df.to_numpy() + array([[1. , 3. ], + [2. , 4.5]]) + + For a mix of numeric and non-numeric types, the output array will + have object dtype. + + >>> df['C'] = pd.date_range('2000', periods=2) + >>> df.to_numpy() + array([[1, 3.0, Timestamp('2000-01-01 00:00:00')], + [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) + """ + if dtype is not None: + dtype = np.dtype(dtype) + result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value) + if result.dtype is not dtype: + result = np.asarray(result, dtype=dtype) + + return result + + def _create_data_for_split_and_tight_to_dict( + self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int] + ) -> list: + """ + Simple helper method to create data for to ``to_dict(orient="split")`` and + ``to_dict(orient="tight")`` to create the main output data + """ + if are_all_object_dtype_cols: + data = [ + list(map(maybe_box_native, t)) + for t in self.itertuples(index=False, name=None) + ] + else: + data = [list(t) for t in self.itertuples(index=False, name=None)] + if object_dtype_indices: + # If we have object_dtype_cols, apply maybe_box_naive after list + # comprehension for perf + for row in data: + for i in object_dtype_indices: + row[i] = maybe_box_native(row[i]) + return data + + @overload + def to_dict( + self, + orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., + *, + into: type[MutableMappingT] | MutableMappingT, + index: bool = ..., + ) -> MutableMappingT: + ... + + @overload + def to_dict( + self, + orient: Literal["records"], + *, + into: type[MutableMappingT] | MutableMappingT, + index: bool = ..., + ) -> list[MutableMappingT]: + ... + + @overload + def to_dict( + self, + orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., + *, + into: type[dict] = ..., + index: bool = ..., + ) -> dict: + ... + + @overload + def to_dict( + self, + orient: Literal["records"], + *, + into: type[dict] = ..., + index: bool = ..., + ) -> list[dict]: + ... + + # error: Incompatible default for argument "into" (default has type "type + # [dict[Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT") + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "orient"], name="to_dict" + ) + def to_dict( + self, + orient: Literal[ + "dict", "list", "series", "split", "tight", "records", "index" + ] = "dict", + into: type[MutableMappingT] + | MutableMappingT = dict, # type: ignore[assignment] + index: bool = True, + ) -> MutableMappingT | list[MutableMappingT]: + """ + Convert the DataFrame to a dictionary. + + The type of the key-value pairs can be customized with the parameters + (see below). + + Parameters + ---------- + orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'} + Determines the type of the values of the dictionary. + + - 'dict' (default) : dict like {column -> {index -> value}} + - 'list' : dict like {column -> [values]} + - 'series' : dict like {column -> Series(values)} + - 'split' : dict like + {'index' -> [index], 'columns' -> [columns], 'data' -> [values]} + - 'tight' : dict like + {'index' -> [index], 'columns' -> [columns], 'data' -> [values], + 'index_names' -> [index.names], 'column_names' -> [column.names]} + - 'records' : list like + [{column -> value}, ... , {column -> value}] + - 'index' : dict like {index -> {column -> value}} + + .. versionadded:: 1.4.0 + 'tight' as an allowed value for the ``orient`` argument + + into : class, default dict + The collections.abc.MutableMapping subclass used for all Mappings + in the return value. Can be the actual class or an empty + instance of the mapping type you want. If you want a + collections.defaultdict, you must pass it initialized. + + index : bool, default True + Whether to include the index item (and index_names item if `orient` + is 'tight') in the returned dictionary. Can only be ``False`` + when `orient` is 'split' or 'tight'. + + .. versionadded:: 2.0.0 + + Returns + ------- + dict, list or collections.abc.MutableMapping + Return a collections.abc.MutableMapping object representing the + DataFrame. The resulting transformation depends on the `orient` + parameter. + + See Also + -------- + DataFrame.from_dict: Create a DataFrame from a dictionary. + DataFrame.to_json: Convert a DataFrame to JSON format. + + Examples + -------- + >>> df = pd.DataFrame({'col1': [1, 2], + ... 'col2': [0.5, 0.75]}, + ... index=['row1', 'row2']) + >>> df + col1 col2 + row1 1 0.50 + row2 2 0.75 + >>> df.to_dict() + {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}} + + You can specify the return orientation. + + >>> df.to_dict('series') + {'col1': row1 1 + row2 2 + Name: col1, dtype: int64, + 'col2': row1 0.50 + row2 0.75 + Name: col2, dtype: float64} + + >>> df.to_dict('split') + {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], + 'data': [[1, 0.5], [2, 0.75]]} + + >>> df.to_dict('records') + [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}] + + >>> df.to_dict('index') + {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}} + + >>> df.to_dict('tight') + {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], + 'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]} + + You can also specify the mapping type. + + >>> from collections import OrderedDict, defaultdict + >>> df.to_dict(into=OrderedDict) + OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])), + ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))]) + + If you want a `defaultdict`, you need to initialize it: + + >>> dd = defaultdict(list) + >>> df.to_dict('records', into=dd) + [defaultdict(, {'col1': 1, 'col2': 0.5}), + defaultdict(, {'col1': 2, 'col2': 0.75})] + """ + from pandas.core.methods.to_dict import to_dict + + return to_dict(self, orient, into=into, index=index) + + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "destination_table"], name="to_gbq" + ) + def to_gbq( + self, + destination_table: str, + project_id: str | None = None, + chunksize: int | None = None, + reauth: bool = False, + if_exists: ToGbqIfexist = "fail", + auth_local_webserver: bool = True, + table_schema: list[dict[str, str]] | None = None, + location: str | None = None, + progress_bar: bool = True, + credentials=None, + ) -> None: + """ + Write a DataFrame to a Google BigQuery table. + + .. deprecated:: 2.2.0 + + Please use ``pandas_gbq.to_gbq`` instead. + + This function requires the `pandas-gbq package + `__. + + See the `How to authenticate with Google BigQuery + `__ + guide for authentication instructions. + + Parameters + ---------- + destination_table : str + Name of table to be written, in the form ``dataset.tablename``. + project_id : str, optional + Google BigQuery Account project ID. Optional when available from + the environment. + chunksize : int, optional + Number of rows to be inserted in each chunk from the dataframe. + Set to ``None`` to load the whole dataframe at once. + reauth : bool, default False + Force Google BigQuery to re-authenticate the user. This is useful + if multiple accounts are used. + if_exists : str, default 'fail' + Behavior when the destination table exists. Value can be one of: + + ``'fail'`` + If table exists raise pandas_gbq.gbq.TableCreationError. + ``'replace'`` + If table exists, drop it, recreate it, and insert data. + ``'append'`` + If table exists, insert data. Create if does not exist. + auth_local_webserver : bool, default True + Use the `local webserver flow`_ instead of the `console flow`_ + when getting user credentials. + + .. _local webserver flow: + https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server + .. _console flow: + https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console + + *New in version 0.2.0 of pandas-gbq*. + + .. versionchanged:: 1.5.0 + Default value is changed to ``True``. Google has deprecated the + ``auth_local_webserver = False`` `"out of band" (copy-paste) + flow + `_. + table_schema : list of dicts, optional + List of BigQuery table fields to which according DataFrame + columns conform to, e.g. ``[{'name': 'col1', 'type': + 'STRING'},...]``. If schema is not provided, it will be + generated according to dtypes of DataFrame columns. See + BigQuery API documentation on available names of a field. + + *New in version 0.3.1 of pandas-gbq*. + location : str, optional + Location where the load job should run. See the `BigQuery locations + documentation + `__ for a + list of available locations. The location must match that of the + target dataset. + + *New in version 0.5.0 of pandas-gbq*. + progress_bar : bool, default True + Use the library `tqdm` to show the progress bar for the upload, + chunk by chunk. + + *New in version 0.5.0 of pandas-gbq*. + credentials : google.auth.credentials.Credentials, optional + Credentials for accessing Google APIs. Use this parameter to + override default credentials, such as to use Compute Engine + :class:`google.auth.compute_engine.Credentials` or Service + Account :class:`google.oauth2.service_account.Credentials` + directly. + + *New in version 0.8.0 of pandas-gbq*. + + See Also + -------- + pandas_gbq.to_gbq : This function in the pandas-gbq library. + read_gbq : Read a DataFrame from Google BigQuery. + + Examples + -------- + Example taken from `Google BigQuery documentation + `_ + + >>> project_id = "my-project" + >>> table_id = 'my_dataset.my_table' + >>> df = pd.DataFrame({ + ... "my_string": ["a", "b", "c"], + ... "my_int64": [1, 2, 3], + ... "my_float64": [4.0, 5.0, 6.0], + ... "my_bool1": [True, False, True], + ... "my_bool2": [False, True, False], + ... "my_dates": pd.date_range("now", periods=3), + ... } + ... ) + + >>> df.to_gbq(table_id, project_id=project_id) # doctest: +SKIP + """ + from pandas.io import gbq + + gbq.to_gbq( + self, + destination_table, + project_id=project_id, + chunksize=chunksize, + reauth=reauth, + if_exists=if_exists, + auth_local_webserver=auth_local_webserver, + table_schema=table_schema, + location=location, + progress_bar=progress_bar, + credentials=credentials, + ) + + @classmethod + def from_records( + cls, + data, + index=None, + exclude=None, + columns=None, + coerce_float: bool = False, + nrows: int | None = None, + ) -> DataFrame: + """ + Convert structured or record ndarray to DataFrame. + + Creates a DataFrame object from a structured ndarray, sequence of + tuples or dicts, or DataFrame. + + Parameters + ---------- + data : structured ndarray, sequence of tuples or dicts, or DataFrame + Structured input data. + + .. deprecated:: 2.1.0 + Passing a DataFrame is deprecated. + index : str, list of fields, array-like + Field of array to use as the index, alternately a specific set of + input labels to use. + exclude : sequence, default None + Columns or fields to exclude. + columns : sequence, default None + Column names to use. If the passed data do not have names + associated with them, this argument provides names for the + columns. Otherwise this argument indicates the order of the columns + in the result (any names not found in the data will become all-NA + columns). + coerce_float : bool, default False + Attempt to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point, useful for SQL result sets. + nrows : int, default None + Number of rows to read if data is an iterator. + + Returns + ------- + DataFrame + + See Also + -------- + DataFrame.from_dict : DataFrame from dict of array-like or dicts. + DataFrame : DataFrame object creation using constructor. + + Examples + -------- + Data can be provided as a structured ndarray: + + >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')], + ... dtype=[('col_1', 'i4'), ('col_2', 'U1')]) + >>> pd.DataFrame.from_records(data) + col_1 col_2 + 0 3 a + 1 2 b + 2 1 c + 3 0 d + + Data can be provided as a list of dicts: + + >>> data = [{'col_1': 3, 'col_2': 'a'}, + ... {'col_1': 2, 'col_2': 'b'}, + ... {'col_1': 1, 'col_2': 'c'}, + ... {'col_1': 0, 'col_2': 'd'}] + >>> pd.DataFrame.from_records(data) + col_1 col_2 + 0 3 a + 1 2 b + 2 1 c + 3 0 d + + Data can be provided as a list of tuples with corresponding columns: + + >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')] + >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2']) + col_1 col_2 + 0 3 a + 1 2 b + 2 1 c + 3 0 d + """ + if isinstance(data, DataFrame): + warnings.warn( + "Passing a DataFrame to DataFrame.from_records is deprecated. Use " + "set_index and/or drop to modify the DataFrame instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if columns is not None: + if is_scalar(columns): + columns = [columns] + data = data[columns] + if index is not None: + data = data.set_index(index) + if exclude is not None: + data = data.drop(columns=exclude) + return data.copy(deep=False) + + result_index = None + + # Make a copy of the input columns so we can modify it + if columns is not None: + columns = ensure_index(columns) + + def maybe_reorder( + arrays: list[ArrayLike], arr_columns: Index, columns: Index, index + ) -> tuple[list[ArrayLike], Index, Index | None]: + """ + If our desired 'columns' do not match the data's pre-existing 'arr_columns', + we re-order our arrays. This is like a pre-emptive (cheap) reindex. + """ + if len(arrays): + length = len(arrays[0]) + else: + length = 0 + + result_index = None + if len(arrays) == 0 and index is None and length == 0: + result_index = default_index(0) + + arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, length) + return arrays, arr_columns, result_index + + if is_iterator(data): + if nrows == 0: + return cls() + + try: + first_row = next(data) + except StopIteration: + return cls(index=index, columns=columns) + + dtype = None + if hasattr(first_row, "dtype") and first_row.dtype.names: + dtype = first_row.dtype + + values = [first_row] + + if nrows is None: + values += data + else: + values.extend(itertools.islice(data, nrows - 1)) + + if dtype is not None: + data = np.array(values, dtype=dtype) + else: + data = values + + if isinstance(data, dict): + if columns is None: + columns = arr_columns = ensure_index(sorted(data)) + arrays = [data[k] for k in columns] + else: + arrays = [] + arr_columns_list = [] + for k, v in data.items(): + if k in columns: + arr_columns_list.append(k) + arrays.append(v) + + arr_columns = Index(arr_columns_list) + arrays, arr_columns, result_index = maybe_reorder( + arrays, arr_columns, columns, index + ) + + elif isinstance(data, np.ndarray): + arrays, columns = to_arrays(data, columns) + arr_columns = columns + else: + arrays, arr_columns = to_arrays(data, columns) + if coerce_float: + for i, arr in enumerate(arrays): + if arr.dtype == object: + # error: Argument 1 to "maybe_convert_objects" has + # incompatible type "Union[ExtensionArray, ndarray]"; + # expected "ndarray" + arrays[i] = lib.maybe_convert_objects( + arr, # type: ignore[arg-type] + try_float=True, + ) + + arr_columns = ensure_index(arr_columns) + if columns is None: + columns = arr_columns + else: + arrays, arr_columns, result_index = maybe_reorder( + arrays, arr_columns, columns, index + ) + + if exclude is None: + exclude = set() + else: + exclude = set(exclude) + + if index is not None: + if isinstance(index, str) or not hasattr(index, "__iter__"): + i = columns.get_loc(index) + exclude.add(index) + if len(arrays) > 0: + result_index = Index(arrays[i], name=index) + else: + result_index = Index([], name=index) + else: + try: + index_data = [arrays[arr_columns.get_loc(field)] for field in index] + except (KeyError, TypeError): + # raised by get_loc, see GH#29258 + result_index = index + else: + result_index = ensure_index_from_sequences(index_data, names=index) + exclude.update(index) + + if any(exclude): + arr_exclude = [x for x in exclude if x in arr_columns] + to_remove = [arr_columns.get_loc(col) for col in arr_exclude] + arrays = [v for i, v in enumerate(arrays) if i not in to_remove] + + columns = columns.drop(exclude) + + manager = _get_option("mode.data_manager", silent=True) + mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager) + + return cls._from_mgr(mgr, axes=mgr.axes) + + def to_records( + self, index: bool = True, column_dtypes=None, index_dtypes=None + ) -> np.rec.recarray: + """ + Convert DataFrame to a NumPy record array. + + Index will be included as the first field of the record array if + requested. + + Parameters + ---------- + index : bool, default True + Include index in resulting record array, stored in 'index' + field or using the index label, if set. + column_dtypes : str, type, dict, default None + If a string or type, the data type to store all columns. If + a dictionary, a mapping of column names and indices (zero-indexed) + to specific data types. + index_dtypes : str, type, dict, default None + If a string or type, the data type to store all index levels. If + a dictionary, a mapping of index level names and indices + (zero-indexed) to specific data types. + + This mapping is applied only if `index=True`. + + Returns + ------- + numpy.rec.recarray + NumPy ndarray with the DataFrame labels as fields and each row + of the DataFrame as entries. + + See Also + -------- + DataFrame.from_records: Convert structured or record ndarray + to DataFrame. + numpy.rec.recarray: An ndarray that allows field access using + attributes, analogous to typed columns in a + spreadsheet. + + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]}, + ... index=['a', 'b']) + >>> df + A B + a 1 0.50 + b 2 0.75 + >>> df.to_records() + rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], + dtype=[('index', 'O'), ('A', '>> df.index = df.index.rename("I") + >>> df.to_records() + rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], + dtype=[('I', 'O'), ('A', '>> df.to_records(index=False) + rec.array([(1, 0.5 ), (2, 0.75)], + dtype=[('A', '>> df.to_records(column_dtypes={"A": "int32"}) + rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], + dtype=[('I', 'O'), ('A', '>> df.to_records(index_dtypes=">> index_dtypes = f">> df.to_records(index_dtypes=index_dtypes) + rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)], + dtype=[('I', 'S1'), ('A', ' Self: + """ + Create DataFrame from a list of arrays corresponding to the columns. + + Parameters + ---------- + arrays : list-like of arrays + Each array in the list corresponds to one column, in order. + columns : list-like, Index + The column names for the resulting DataFrame. + index : list-like, Index + The rows labels for the resulting DataFrame. + dtype : dtype, optional + Optional dtype to enforce for all arrays. + verify_integrity : bool, default True + Validate and homogenize all input. If set to False, it is assumed + that all elements of `arrays` are actual arrays how they will be + stored in a block (numpy ndarray or ExtensionArray), have the same + length as and are aligned with the index, and that `columns` and + `index` are ensured to be an Index object. + + Returns + ------- + DataFrame + """ + if dtype is not None: + dtype = pandas_dtype(dtype) + + manager = _get_option("mode.data_manager", silent=True) + columns = ensure_index(columns) + if len(columns) != len(arrays): + raise ValueError("len(columns) must match len(arrays)") + mgr = arrays_to_mgr( + arrays, + columns, + index, + dtype=dtype, + verify_integrity=verify_integrity, + typ=manager, + ) + return cls._from_mgr(mgr, axes=mgr.axes) + + @doc( + storage_options=_shared_docs["storage_options"], + compression_options=_shared_docs["compression_options"] % "path", + ) + def to_stata( + self, + path: FilePath | WriteBuffer[bytes], + *, + convert_dates: dict[Hashable, str] | None = None, + write_index: bool = True, + byteorder: ToStataByteorder | None = None, + time_stamp: datetime.datetime | None = None, + data_label: str | None = None, + variable_labels: dict[Hashable, str] | None = None, + version: int | None = 114, + convert_strl: Sequence[Hashable] | None = None, + compression: CompressionOptions = "infer", + storage_options: StorageOptions | None = None, + value_labels: dict[Hashable, dict[float, str]] | None = None, + ) -> None: + """ + Export DataFrame object to Stata dta format. + + Writes the DataFrame to a Stata dataset file. + "dta" files contain a Stata dataset. + + Parameters + ---------- + path : str, path object, or buffer + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``write()`` function. + + convert_dates : dict + Dictionary mapping columns containing datetime types to stata + internal format to use when writing the dates. Options are 'tc', + 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer + or a name. Datetime columns that do not have a conversion type + specified will be converted to 'tc'. Raises NotImplementedError if + a datetime column has timezone information. + write_index : bool + Write the index to Stata dataset. + byteorder : str + Can be ">", "<", "little", or "big". default is `sys.byteorder`. + time_stamp : datetime + A datetime to use as file creation date. Default is the current + time. + data_label : str, optional + A label for the data set. Must be 80 characters or smaller. + variable_labels : dict + Dictionary containing columns as keys and variable labels as + values. Each label must be 80 characters or smaller. + version : {{114, 117, 118, 119, None}}, default 114 + Version to use in the output dta file. Set to None to let pandas + decide between 118 or 119 formats depending on the number of + columns in the frame. Version 114 can be read by Stata 10 and + later. Version 117 can be read by Stata 13 or later. Version 118 + is supported in Stata 14 and later. Version 119 is supported in + Stata 15 and later. Version 114 limits string variables to 244 + characters or fewer while versions 117 and later allow strings + with lengths up to 2,000,000 characters. Versions 118 and 119 + support Unicode characters, and version 119 supports more than + 32,767 variables. + + Version 119 should usually only be used when the number of + variables exceeds the capacity of dta format 118. Exporting + smaller datasets in format 119 may have unintended consequences, + and, as of November 2020, Stata SE cannot read version 119 files. + + convert_strl : list, optional + List of column names to convert to string columns to Stata StrL + format. Only available if version is 117. Storing strings in the + StrL format can produce smaller dta files if strings have more than + 8 characters and values are repeated. + {compression_options} + + .. versionchanged:: 1.4.0 Zstandard support. + + {storage_options} + + value_labels : dict of dicts + Dictionary containing columns as keys and dictionaries of column value + to labels as values. Labels for a single variable must be 32,000 + characters or smaller. + + .. versionadded:: 1.4.0 + + Raises + ------ + NotImplementedError + * If datetimes contain timezone information + * Column dtype is not representable in Stata + ValueError + * Columns listed in convert_dates are neither datetime64[ns] + or datetime.datetime + * Column listed in convert_dates is not in DataFrame + * Categorical label contains more than 32,000 characters + + See Also + -------- + read_stata : Import Stata data files. + io.stata.StataWriter : Low-level writer for Stata data files. + io.stata.StataWriter117 : Low-level writer for version 117 files. + + Examples + -------- + >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', + ... 'parrot'], + ... 'speed': [350, 18, 361, 15]}}) + >>> df.to_stata('animals.dta') # doctest: +SKIP + """ + if version not in (114, 117, 118, 119, None): + raise ValueError("Only formats 114, 117, 118 and 119 are supported.") + if version == 114: + if convert_strl is not None: + raise ValueError("strl is not supported in format 114") + from pandas.io.stata import StataWriter as statawriter + elif version == 117: + # Incompatible import of "statawriter" (imported name has type + # "Type[StataWriter117]", local name has type "Type[StataWriter]") + from pandas.io.stata import ( # type: ignore[assignment] + StataWriter117 as statawriter, + ) + else: # versions 118 and 119 + # Incompatible import of "statawriter" (imported name has type + # "Type[StataWriter117]", local name has type "Type[StataWriter]") + from pandas.io.stata import ( # type: ignore[assignment] + StataWriterUTF8 as statawriter, + ) + + kwargs: dict[str, Any] = {} + if version is None or version >= 117: + # strl conversion is only supported >= 117 + kwargs["convert_strl"] = convert_strl + if version is None or version >= 118: + # Specifying the version is only supported for UTF8 (118 or 119) + kwargs["version"] = version + + writer = statawriter( + path, + self, + convert_dates=convert_dates, + byteorder=byteorder, + time_stamp=time_stamp, + data_label=data_label, + write_index=write_index, + variable_labels=variable_labels, + compression=compression, + storage_options=storage_options, + value_labels=value_labels, + **kwargs, + ) + writer.write_file() + + def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None: + """ + Write a DataFrame to the binary Feather format. + + Parameters + ---------- + path : str, path object, file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``write()`` function. If a string or a path, + it will be used as Root Directory path when writing a partitioned dataset. + **kwargs : + Additional keywords passed to :func:`pyarrow.feather.write_feather`. + This includes the `compression`, `compression_level`, `chunksize` + and `version` keywords. + + Notes + ----- + This function writes the dataframe as a `feather file + `_. Requires a default + index. For saving the DataFrame with your custom index use a method that + supports custom indices e.g. `to_parquet`. + + Examples + -------- + >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + >>> df.to_feather("file.feather") # doctest: +SKIP + """ + from pandas.io.feather_format import to_feather + + to_feather(self, path, **kwargs) + + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "buf"], name="to_markdown" + ) + @doc( + Series.to_markdown, + klass=_shared_doc_kwargs["klass"], + storage_options=_shared_docs["storage_options"], + examples="""Examples + -------- + >>> df = pd.DataFrame( + ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]} + ... ) + >>> print(df.to_markdown()) + | | animal_1 | animal_2 | + |---:|:-----------|:-----------| + | 0 | elk | dog | + | 1 | pig | quetzal | + + Output markdown with a tabulate option. + + >>> print(df.to_markdown(tablefmt="grid")) + +----+------------+------------+ + | | animal_1 | animal_2 | + +====+============+============+ + | 0 | elk | dog | + +----+------------+------------+ + | 1 | pig | quetzal | + +----+------------+------------+""", + ) + def to_markdown( + self, + buf: FilePath | WriteBuffer[str] | None = None, + mode: str = "wt", + index: bool = True, + storage_options: StorageOptions | None = None, + **kwargs, + ) -> str | None: + if "showindex" in kwargs: + raise ValueError("Pass 'index' instead of 'showindex") + + kwargs.setdefault("headers", "keys") + kwargs.setdefault("tablefmt", "pipe") + kwargs.setdefault("showindex", index) + tabulate = import_optional_dependency("tabulate") + result = tabulate.tabulate(self, **kwargs) + if buf is None: + return result + + with get_handle(buf, mode, storage_options=storage_options) as handles: + handles.handle.write(result) + return None + + @overload + def to_parquet( + self, + path: None = ..., + engine: Literal["auto", "pyarrow", "fastparquet"] = ..., + compression: str | None = ..., + index: bool | None = ..., + partition_cols: list[str] | None = ..., + storage_options: StorageOptions = ..., + **kwargs, + ) -> bytes: + ... + + @overload + def to_parquet( + self, + path: FilePath | WriteBuffer[bytes], + engine: Literal["auto", "pyarrow", "fastparquet"] = ..., + compression: str | None = ..., + index: bool | None = ..., + partition_cols: list[str] | None = ..., + storage_options: StorageOptions = ..., + **kwargs, + ) -> None: + ... + + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "path"], name="to_parquet" + ) + @doc(storage_options=_shared_docs["storage_options"]) + def to_parquet( + self, + path: FilePath | WriteBuffer[bytes] | None = None, + engine: Literal["auto", "pyarrow", "fastparquet"] = "auto", + compression: str | None = "snappy", + index: bool | None = None, + partition_cols: list[str] | None = None, + storage_options: StorageOptions | None = None, + **kwargs, + ) -> bytes | None: + """ + Write a DataFrame to the binary parquet format. + + This function writes the dataframe as a `parquet file + `_. You can choose different parquet + backends, and have the option of compression. See + :ref:`the user guide ` for more details. + + Parameters + ---------- + path : str, path object, file-like object, or None, default None + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``write()`` function. If None, the result is + returned as bytes. If a string or path, it will be used as Root Directory + path when writing a partitioned dataset. + engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto' + Parquet library to use. If 'auto', then the option + ``io.parquet.engine`` is used. The default ``io.parquet.engine`` + behavior is to try 'pyarrow', falling back to 'fastparquet' if + 'pyarrow' is unavailable. + compression : str or None, default 'snappy' + Name of the compression to use. Use ``None`` for no compression. + Supported options: 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'. + index : bool, default None + If ``True``, include the dataframe's index(es) in the file output. + If ``False``, they will not be written to the file. + If ``None``, similar to ``True`` the dataframe's index(es) + will be saved. However, instead of being saved as values, + the RangeIndex will be stored as a range in the metadata so it + doesn't require much space and is faster. Other indexes will + be included as columns in the file output. + partition_cols : list, optional, default None + Column names by which to partition the dataset. + Columns are partitioned in the order they are given. + Must be None if path is not a string. + {storage_options} + + **kwargs + Additional arguments passed to the parquet library. See + :ref:`pandas io ` for more details. + + Returns + ------- + bytes if no path argument is provided else None + + See Also + -------- + read_parquet : Read a parquet file. + DataFrame.to_orc : Write an orc file. + DataFrame.to_csv : Write a csv file. + DataFrame.to_sql : Write to a sql table. + DataFrame.to_hdf : Write to hdf. + + Notes + ----- + This function requires either the `fastparquet + `_ or `pyarrow + `_ library. + + Examples + -------- + >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}}) + >>> df.to_parquet('df.parquet.gzip', + ... compression='gzip') # doctest: +SKIP + >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP + col1 col2 + 0 1 3 + 1 2 4 + + If you want to get a buffer to the parquet content you can use a io.BytesIO + object, as long as you don't use partition_cols, which creates multiple files. + + >>> import io + >>> f = io.BytesIO() + >>> df.to_parquet(f) + >>> f.seek(0) + 0 + >>> content = f.read() + """ + from pandas.io.parquet import to_parquet + + return to_parquet( + self, + path, + engine, + compression=compression, + index=index, + partition_cols=partition_cols, + storage_options=storage_options, + **kwargs, + ) + + def to_orc( + self, + path: FilePath | WriteBuffer[bytes] | None = None, + *, + engine: Literal["pyarrow"] = "pyarrow", + index: bool | None = None, + engine_kwargs: dict[str, Any] | None = None, + ) -> bytes | None: + """ + Write a DataFrame to the ORC format. + + .. versionadded:: 1.5.0 + + Parameters + ---------- + path : str, file-like object or None, default None + If a string, it will be used as Root Directory path + when writing a partitioned dataset. By file-like object, + we refer to objects with a write() method, such as a file handle + (e.g. via builtin open function). If path is None, + a bytes object is returned. + engine : {'pyarrow'}, default 'pyarrow' + ORC library to use. + index : bool, optional + If ``True``, include the dataframe's index(es) in the file output. + If ``False``, they will not be written to the file. + If ``None``, similar to ``infer`` the dataframe's index(es) + will be saved. However, instead of being saved as values, + the RangeIndex will be stored as a range in the metadata so it + doesn't require much space and is faster. Other indexes will + be included as columns in the file output. + engine_kwargs : dict[str, Any] or None, default None + Additional keyword arguments passed to :func:`pyarrow.orc.write_table`. + + Returns + ------- + bytes if no path argument is provided else None + + Raises + ------ + NotImplementedError + Dtype of one or more columns is category, unsigned integers, interval, + period or sparse. + ValueError + engine is not pyarrow. + + See Also + -------- + read_orc : Read a ORC file. + DataFrame.to_parquet : Write a parquet file. + DataFrame.to_csv : Write a csv file. + DataFrame.to_sql : Write to a sql table. + DataFrame.to_hdf : Write to hdf. + + Notes + ----- + * Before using this function you should read the :ref:`user guide about + ORC ` and :ref:`install optional dependencies `. + * This function requires `pyarrow `_ + library. + * For supported dtypes please refer to `supported ORC features in Arrow + `__. + * Currently timezones in datetime columns are not preserved when a + dataframe is converted into ORC files. + + Examples + -------- + >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]}) + >>> df.to_orc('df.orc') # doctest: +SKIP + >>> pd.read_orc('df.orc') # doctest: +SKIP + col1 col2 + 0 1 4 + 1 2 3 + + If you want to get a buffer to the orc content you can write it to io.BytesIO + + >>> import io + >>> b = io.BytesIO(df.to_orc()) # doctest: +SKIP + >>> b.seek(0) # doctest: +SKIP + 0 + >>> content = b.read() # doctest: +SKIP + """ + from pandas.io.orc import to_orc + + return to_orc( + self, path, engine=engine, index=index, engine_kwargs=engine_kwargs + ) + + @overload + def to_html( + self, + buf: FilePath | WriteBuffer[str], + columns: Axes | None = ..., + col_space: ColspaceArgType | None = ..., + header: bool = ..., + index: bool = ..., + na_rep: str = ..., + formatters: FormattersType | None = ..., + float_format: FloatFormatType | None = ..., + sparsify: bool | None = ..., + index_names: bool = ..., + justify: str | None = ..., + max_rows: int | None = ..., + max_cols: int | None = ..., + show_dimensions: bool | str = ..., + decimal: str = ..., + bold_rows: bool = ..., + classes: str | list | tuple | None = ..., + escape: bool = ..., + notebook: bool = ..., + border: int | bool | None = ..., + table_id: str | None = ..., + render_links: bool = ..., + encoding: str | None = ..., + ) -> None: + ... + + @overload + def to_html( + self, + buf: None = ..., + columns: Axes | None = ..., + col_space: ColspaceArgType | None = ..., + header: bool = ..., + index: bool = ..., + na_rep: str = ..., + formatters: FormattersType | None = ..., + float_format: FloatFormatType | None = ..., + sparsify: bool | None = ..., + index_names: bool = ..., + justify: str | None = ..., + max_rows: int | None = ..., + max_cols: int | None = ..., + show_dimensions: bool | str = ..., + decimal: str = ..., + bold_rows: bool = ..., + classes: str | list | tuple | None = ..., + escape: bool = ..., + notebook: bool = ..., + border: int | bool | None = ..., + table_id: str | None = ..., + render_links: bool = ..., + encoding: str | None = ..., + ) -> str: + ... + + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "buf"], name="to_html" + ) + @Substitution( + header_type="bool", + header="Whether to print column labels, default True", + col_space_type="str or int, list or dict of int or str", + col_space="The minimum width of each column in CSS length " + "units. An int is assumed to be px units.", + ) + @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) + def to_html( + self, + buf: FilePath | WriteBuffer[str] | None = None, + columns: Axes | None = None, + col_space: ColspaceArgType | None = None, + header: bool = True, + index: bool = True, + na_rep: str = "NaN", + formatters: FormattersType | None = None, + float_format: FloatFormatType | None = None, + sparsify: bool | None = None, + index_names: bool = True, + justify: str | None = None, + max_rows: int | None = None, + max_cols: int | None = None, + show_dimensions: bool | str = False, + decimal: str = ".", + bold_rows: bool = True, + classes: str | list | tuple | None = None, + escape: bool = True, + notebook: bool = False, + border: int | bool | None = None, + table_id: str | None = None, + render_links: bool = False, + encoding: str | None = None, + ) -> str | None: + """ + Render a DataFrame as an HTML table. + %(shared_params)s + bold_rows : bool, default True + Make the row labels bold in the output. + classes : str or list or tuple, default None + CSS class(es) to apply to the resulting html table. + escape : bool, default True + Convert the characters <, >, and & to HTML-safe sequences. + notebook : {True, False}, default False + Whether the generated HTML is for IPython Notebook. + border : int + A ``border=border`` attribute is included in the opening + `` tag. Default ``pd.options.display.html.border``. + table_id : str, optional + A css id is included in the opening `
` tag if specified. + render_links : bool, default False + Convert URLs to HTML links. + encoding : str, default "utf-8" + Set character encoding. + %(returns)s + See Also + -------- + to_string : Convert DataFrame to a string. + + Examples + -------- + >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]}) + >>> html_string = '''
+ ... + ... + ... + ... + ... + ... + ... + ... + ... + ... + ... + ... + ... + ... + ... + ... + ... + ... + ... + ...
col1col2
014
123
''' + >>> assert html_string == df.to_html() + """ + if justify is not None and justify not in fmt.VALID_JUSTIFY_PARAMETERS: + raise ValueError("Invalid value for justify parameter") + + formatter = fmt.DataFrameFormatter( + self, + columns=columns, + col_space=col_space, + na_rep=na_rep, + header=header, + index=index, + formatters=formatters, + float_format=float_format, + bold_rows=bold_rows, + sparsify=sparsify, + justify=justify, + index_names=index_names, + escape=escape, + decimal=decimal, + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + ) + # TODO: a generic formatter wld b in DataFrameFormatter + return fmt.DataFrameRenderer(formatter).to_html( + buf=buf, + classes=classes, + notebook=notebook, + border=border, + encoding=encoding, + table_id=table_id, + render_links=render_links, + ) + + @overload + def to_xml( + self, + path_or_buffer: None = ..., + *, + index: bool = ..., + root_name: str | None = ..., + row_name: str | None = ..., + na_rep: str | None = ..., + attr_cols: list[str] | None = ..., + elem_cols: list[str] | None = ..., + namespaces: dict[str | None, str] | None = ..., + prefix: str | None = ..., + encoding: str = ..., + xml_declaration: bool | None = ..., + pretty_print: bool | None = ..., + parser: XMLParsers | None = ..., + stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ..., + compression: CompressionOptions = ..., + storage_options: StorageOptions | None = ..., + ) -> str: + ... + + @overload + def to_xml( + self, + path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str], + *, + index: bool = ..., + root_name: str | None = ..., + row_name: str | None = ..., + na_rep: str | None = ..., + attr_cols: list[str] | None = ..., + elem_cols: list[str] | None = ..., + namespaces: dict[str | None, str] | None = ..., + prefix: str | None = ..., + encoding: str = ..., + xml_declaration: bool | None = ..., + pretty_print: bool | None = ..., + parser: XMLParsers | None = ..., + stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ..., + compression: CompressionOptions = ..., + storage_options: StorageOptions | None = ..., + ) -> None: + ... + + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "path_or_buffer"], name="to_xml" + ) + @doc( + storage_options=_shared_docs["storage_options"], + compression_options=_shared_docs["compression_options"] % "path_or_buffer", + ) + def to_xml( + self, + path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, + index: bool = True, + root_name: str | None = "data", + row_name: str | None = "row", + na_rep: str | None = None, + attr_cols: list[str] | None = None, + elem_cols: list[str] | None = None, + namespaces: dict[str | None, str] | None = None, + prefix: str | None = None, + encoding: str = "utf-8", + xml_declaration: bool | None = True, + pretty_print: bool | None = True, + parser: XMLParsers | None = "lxml", + stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None, + compression: CompressionOptions = "infer", + storage_options: StorageOptions | None = None, + ) -> str | None: + """ + Render a DataFrame to an XML document. + + .. versionadded:: 1.3.0 + + Parameters + ---------- + path_or_buffer : str, path object, file-like object, or None, default None + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a ``write()`` function. If None, the result is returned + as a string. + index : bool, default True + Whether to include index in XML document. + root_name : str, default 'data' + The name of root element in XML document. + row_name : str, default 'row' + The name of row element in XML document. + na_rep : str, optional + Missing data representation. + attr_cols : list-like, optional + List of columns to write as attributes in row element. + Hierarchical columns will be flattened with underscore + delimiting the different levels. + elem_cols : list-like, optional + List of columns to write as children in row element. By default, + all columns output as children of row element. Hierarchical + columns will be flattened with underscore delimiting the + different levels. + namespaces : dict, optional + All namespaces to be defined in root element. Keys of dict + should be prefix names and values of dict corresponding URIs. + Default namespaces should be given empty string key. For + example, :: + + namespaces = {{"": "https://example.com"}} + + prefix : str, optional + Namespace prefix to be used for every element and/or attribute + in document. This should be one of the keys in ``namespaces`` + dict. + encoding : str, default 'utf-8' + Encoding of the resulting document. + xml_declaration : bool, default True + Whether to include the XML declaration at start of document. + pretty_print : bool, default True + Whether output should be pretty printed with indentation and + line breaks. + parser : {{'lxml','etree'}}, default 'lxml' + Parser module to use for building of tree. Only 'lxml' and + 'etree' are supported. With 'lxml', the ability to use XSLT + stylesheet is supported. + stylesheet : str, path object or file-like object, optional + A URL, file-like object, or a raw string containing an XSLT + script used to transform the raw XML output. Script should use + layout of elements and attributes from original output. This + argument requires ``lxml`` to be installed. Only XSLT 1.0 + scripts and not later versions is currently supported. + {compression_options} + + .. versionchanged:: 1.4.0 Zstandard support. + + {storage_options} + + Returns + ------- + None or str + If ``io`` is None, returns the resulting XML format as a + string. Otherwise returns None. + + See Also + -------- + to_json : Convert the pandas object to a JSON string. + to_html : Convert DataFrame to a html. + + Examples + -------- + >>> df = pd.DataFrame({{'shape': ['square', 'circle', 'triangle'], + ... 'degrees': [360, 360, 180], + ... 'sides': [4, np.nan, 3]}}) + + >>> df.to_xml() # doctest: +SKIP + + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + + + + >>> df.to_xml(attr_cols=[ + ... 'index', 'shape', 'degrees', 'sides' + ... ]) # doctest: +SKIP + + + + + + + + >>> df.to_xml(namespaces={{"doc": "https://example.com"}}, + ... prefix="doc") # doctest: +SKIP + + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + + + """ + + from pandas.io.formats.xml import ( + EtreeXMLFormatter, + LxmlXMLFormatter, + ) + + lxml = import_optional_dependency("lxml.etree", errors="ignore") + + TreeBuilder: type[EtreeXMLFormatter | LxmlXMLFormatter] + + if parser == "lxml": + if lxml is not None: + TreeBuilder = LxmlXMLFormatter + else: + raise ImportError( + "lxml not found, please install or use the etree parser." + ) + + elif parser == "etree": + TreeBuilder = EtreeXMLFormatter + + else: + raise ValueError("Values for parser can only be lxml or etree.") + + xml_formatter = TreeBuilder( + self, + path_or_buffer=path_or_buffer, + index=index, + root_name=root_name, + row_name=row_name, + na_rep=na_rep, + attr_cols=attr_cols, + elem_cols=elem_cols, + namespaces=namespaces, + prefix=prefix, + encoding=encoding, + xml_declaration=xml_declaration, + pretty_print=pretty_print, + stylesheet=stylesheet, + compression=compression, + storage_options=storage_options, + ) + + return xml_formatter.write_output() + + # ---------------------------------------------------------------------- + @doc(INFO_DOCSTRING, **frame_sub_kwargs) + def info( + self, + verbose: bool | None = None, + buf: WriteBuffer[str] | None = None, + max_cols: int | None = None, + memory_usage: bool | str | None = None, + show_counts: bool | None = None, + ) -> None: + info = DataFrameInfo( + data=self, + memory_usage=memory_usage, + ) + info.render( + buf=buf, + max_cols=max_cols, + verbose=verbose, + show_counts=show_counts, + ) + + def memory_usage(self, index: bool = True, deep: bool = False) -> Series: + """ + Return the memory usage of each column in bytes. + + The memory usage can optionally include the contribution of + the index and elements of `object` dtype. + + This value is displayed in `DataFrame.info` by default. This can be + suppressed by setting ``pandas.options.display.memory_usage`` to False. + + Parameters + ---------- + index : bool, default True + Specifies whether to include the memory usage of the DataFrame's + index in returned Series. If ``index=True``, the memory usage of + the index is the first item in the output. + deep : bool, default False + If True, introspect the data deeply by interrogating + `object` dtypes for system-level memory consumption, and include + it in the returned values. + + Returns + ------- + Series + A Series whose index is the original column names and whose values + is the memory usage of each column in bytes. + + See Also + -------- + numpy.ndarray.nbytes : Total bytes consumed by the elements of an + ndarray. + Series.memory_usage : Bytes consumed by a Series. + Categorical : Memory-efficient array for string values with + many repeated values. + DataFrame.info : Concise summary of a DataFrame. + + Notes + ----- + See the :ref:`Frequently Asked Questions ` for more + details. + + Examples + -------- + >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool'] + >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t)) + ... for t in dtypes]) + >>> df = pd.DataFrame(data) + >>> df.head() + int64 float64 complex128 object bool + 0 1 1.0 1.0+0.0j 1 True + 1 1 1.0 1.0+0.0j 1 True + 2 1 1.0 1.0+0.0j 1 True + 3 1 1.0 1.0+0.0j 1 True + 4 1 1.0 1.0+0.0j 1 True + + >>> df.memory_usage() + Index 128 + int64 40000 + float64 40000 + complex128 80000 + object 40000 + bool 5000 + dtype: int64 + + >>> df.memory_usage(index=False) + int64 40000 + float64 40000 + complex128 80000 + object 40000 + bool 5000 + dtype: int64 + + The memory footprint of `object` dtype columns is ignored by default: + + >>> df.memory_usage(deep=True) + Index 128 + int64 40000 + float64 40000 + complex128 80000 + object 180000 + bool 5000 + dtype: int64 + + Use a Categorical for efficient storage of an object-dtype column with + many repeated values. + + >>> df['object'].astype('category').memory_usage(deep=True) + 5244 + """ + result = self._constructor_sliced( + [c.memory_usage(index=False, deep=deep) for col, c in self.items()], + index=self.columns, + dtype=np.intp, + ) + if index: + index_memory_usage = self._constructor_sliced( + self.index.memory_usage(deep=deep), index=["Index"] + ) + result = index_memory_usage._append(result) + return result + + def transpose(self, *args, copy: bool = False) -> DataFrame: + """ + Transpose index and columns. + + Reflect the DataFrame over its main diagonal by writing rows as columns + and vice-versa. The property :attr:`.T` is an accessor to the method + :meth:`transpose`. + + Parameters + ---------- + *args : tuple, optional + Accepted for compatibility with NumPy. + copy : bool, default False + Whether to copy the data after transposing, even for DataFrames + with a single dtype. + + Note that a copy is always required for mixed dtype DataFrames, + or for DataFrames with any extension types. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + + Returns + ------- + DataFrame + The transposed DataFrame. + + See Also + -------- + numpy.transpose : Permute the dimensions of a given array. + + Notes + ----- + Transposing a DataFrame with mixed dtypes will result in a homogeneous + DataFrame with the `object` dtype. In such a case, a copy of the data + is always made. + + Examples + -------- + **Square DataFrame with homogeneous dtype** + + >>> d1 = {'col1': [1, 2], 'col2': [3, 4]} + >>> df1 = pd.DataFrame(data=d1) + >>> df1 + col1 col2 + 0 1 3 + 1 2 4 + + >>> df1_transposed = df1.T # or df1.transpose() + >>> df1_transposed + 0 1 + col1 1 2 + col2 3 4 + + When the dtype is homogeneous in the original DataFrame, we get a + transposed DataFrame with the same dtype: + + >>> df1.dtypes + col1 int64 + col2 int64 + dtype: object + >>> df1_transposed.dtypes + 0 int64 + 1 int64 + dtype: object + + **Non-square DataFrame with mixed dtypes** + + >>> d2 = {'name': ['Alice', 'Bob'], + ... 'score': [9.5, 8], + ... 'employed': [False, True], + ... 'kids': [0, 0]} + >>> df2 = pd.DataFrame(data=d2) + >>> df2 + name score employed kids + 0 Alice 9.5 False 0 + 1 Bob 8.0 True 0 + + >>> df2_transposed = df2.T # or df2.transpose() + >>> df2_transposed + 0 1 + name Alice Bob + score 9.5 8.0 + employed False True + kids 0 0 + + When the DataFrame has mixed dtypes, we get a transposed DataFrame with + the `object` dtype: + + >>> df2.dtypes + name object + score float64 + employed bool + kids int64 + dtype: object + >>> df2_transposed.dtypes + 0 object + 1 object + dtype: object + """ + nv.validate_transpose(args, {}) + # construct the args + + dtypes = list(self.dtypes) + + if self._can_fast_transpose: + # Note: tests pass without this, but this improves perf quite a bit. + new_vals = self._values.T + if copy and not using_copy_on_write(): + new_vals = new_vals.copy() + + result = self._constructor( + new_vals, + index=self.columns, + columns=self.index, + copy=False, + dtype=new_vals.dtype, + ) + if using_copy_on_write() and len(self) > 0: + result._mgr.add_references(self._mgr) # type: ignore[arg-type] + + elif ( + self._is_homogeneous_type + and dtypes + and isinstance(dtypes[0], ExtensionDtype) + ): + new_values: list + if isinstance(dtypes[0], BaseMaskedDtype): + # We have masked arrays with the same dtype. We can transpose faster. + from pandas.core.arrays.masked import ( + transpose_homogeneous_masked_arrays, + ) + + new_values = transpose_homogeneous_masked_arrays( + cast(Sequence[BaseMaskedArray], self._iter_column_arrays()) + ) + elif isinstance(dtypes[0], ArrowDtype): + # We have arrow EAs with the same dtype. We can transpose faster. + from pandas.core.arrays.arrow.array import ( + ArrowExtensionArray, + transpose_homogeneous_pyarrow, + ) + + new_values = transpose_homogeneous_pyarrow( + cast(Sequence[ArrowExtensionArray], self._iter_column_arrays()) + ) + else: + # We have other EAs with the same dtype. We preserve dtype in transpose. + dtyp = dtypes[0] + arr_typ = dtyp.construct_array_type() + values = self.values + new_values = [arr_typ._from_sequence(row, dtype=dtyp) for row in values] + + result = type(self)._from_arrays( + new_values, + index=self.columns, + columns=self.index, + verify_integrity=False, + ) + + else: + new_arr = self.values.T + if copy and not using_copy_on_write(): + new_arr = new_arr.copy() + result = self._constructor( + new_arr, + index=self.columns, + columns=self.index, + dtype=new_arr.dtype, + # We already made a copy (more than one block) + copy=False, + ) + + return result.__finalize__(self, method="transpose") + + @property + def T(self) -> DataFrame: + """ + The transpose of the DataFrame. + + Returns + ------- + DataFrame + The transposed DataFrame. + + See Also + -------- + DataFrame.transpose : Transpose index and columns. + + Examples + -------- + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df + col1 col2 + 0 1 3 + 1 2 4 + + >>> df.T + 0 1 + col1 1 2 + col2 3 4 + """ + return self.transpose() + + # ---------------------------------------------------------------------- + # Indexing Methods + + def _ixs(self, i: int, axis: AxisInt = 0) -> Series: + """ + Parameters + ---------- + i : int + axis : int + + Returns + ------- + Series + """ + # irow + if axis == 0: + new_mgr = self._mgr.fast_xs(i) + + # if we are a copy, mark as such + copy = isinstance(new_mgr.array, np.ndarray) and new_mgr.array.base is None + result = self._constructor_sliced_from_mgr(new_mgr, axes=new_mgr.axes) + result._name = self.index[i] + result = result.__finalize__(self) + result._set_is_copy(self, copy=copy) + return result + + # icol + else: + label = self.columns[i] + + col_mgr = self._mgr.iget(i) + result = self._box_col_values(col_mgr, i) + + # this is a cached value, mark it so + result._set_as_cached(label, self) + return result + + def _get_column_array(self, i: int) -> ArrayLike: + """ + Get the values of the i'th column (ndarray or ExtensionArray, as stored + in the Block) + + Warning! The returned array is a view but doesn't handle Copy-on-Write, + so this should be used with caution (for read-only purposes). + """ + return self._mgr.iget_values(i) + + def _iter_column_arrays(self) -> Iterator[ArrayLike]: + """ + Iterate over the arrays of all columns in order. + This returns the values as stored in the Block (ndarray or ExtensionArray). + + Warning! The returned array is a view but doesn't handle Copy-on-Write, + so this should be used with caution (for read-only purposes). + """ + if isinstance(self._mgr, ArrayManager): + yield from self._mgr.arrays + else: + for i in range(len(self.columns)): + yield self._get_column_array(i) + + def _getitem_nocopy(self, key: list): + """ + Behaves like __getitem__, but returns a view in cases where __getitem__ + would make a copy. + """ + # TODO(CoW): can be removed if/when we are always Copy-on-Write + indexer = self.columns._get_indexer_strict(key, "columns")[1] + new_axis = self.columns[indexer] + + new_mgr = self._mgr.reindex_indexer( + new_axis, + indexer, + axis=0, + allow_dups=True, + copy=False, + only_slice=True, + ) + result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + result = result.__finalize__(self) + return result + + def __getitem__(self, key): + check_dict_or_set_indexers(key) + key = lib.item_from_zerodim(key) + key = com.apply_if_callable(key, self) + + if is_hashable(key) and not is_iterator(key): + # is_iterator to exclude generator e.g. test_getitem_listlike + # shortcut if the key is in columns + is_mi = isinstance(self.columns, MultiIndex) + # GH#45316 Return view if key is not duplicated + # Only use drop_duplicates with duplicates for performance + if not is_mi and ( + self.columns.is_unique + and key in self.columns + or key in self.columns.drop_duplicates(keep=False) + ): + return self._get_item_cache(key) + + elif is_mi and self.columns.is_unique and key in self.columns: + return self._getitem_multilevel(key) + + # Do we have a slicer (on rows)? + if isinstance(key, slice): + return self._getitem_slice(key) + + # Do we have a (boolean) DataFrame? + if isinstance(key, DataFrame): + return self.where(key) + + # Do we have a (boolean) 1d indexer? + if com.is_bool_indexer(key): + return self._getitem_bool_array(key) + + # We are left with two options: a single key, and a collection of keys, + # We interpret tuples as collections only for non-MultiIndex + is_single_key = isinstance(key, tuple) or not is_list_like(key) + + if is_single_key: + if self.columns.nlevels > 1: + return self._getitem_multilevel(key) + indexer = self.columns.get_loc(key) + if is_integer(indexer): + indexer = [indexer] + else: + if is_iterator(key): + key = list(key) + indexer = self.columns._get_indexer_strict(key, "columns")[1] + + # take() does not accept boolean indexers + if getattr(indexer, "dtype", None) == bool: + indexer = np.where(indexer)[0] + + if isinstance(indexer, slice): + return self._slice(indexer, axis=1) + + data = self._take_with_is_copy(indexer, axis=1) + + if is_single_key: + # What does looking for a single key in a non-unique index return? + # The behavior is inconsistent. It returns a Series, except when + # - the key itself is repeated (test on data.shape, #9519), or + # - we have a MultiIndex on columns (test on self.columns, #21309) + if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex): + # GH#26490 using data[key] can cause RecursionError + return data._get_item_cache(key) + + return data + + def _getitem_bool_array(self, key): + # also raises Exception if object array with NA values + # warning here just in case -- previously __setitem__ was + # reindexing but __getitem__ was not; it seems more reasonable to + # go with the __setitem__ behavior since that is more consistent + # with all other indexing behavior + if isinstance(key, Series) and not key.index.equals(self.index): + warnings.warn( + "Boolean Series key will be reindexed to match DataFrame index.", + UserWarning, + stacklevel=find_stack_level(), + ) + elif len(key) != len(self.index): + raise ValueError( + f"Item wrong length {len(key)} instead of {len(self.index)}." + ) + + # check_bool_indexer will throw exception if Series key cannot + # be reindexed to match DataFrame rows + key = check_bool_indexer(self.index, key) + + if key.all(): + return self.copy(deep=None) + + indexer = key.nonzero()[0] + return self._take_with_is_copy(indexer, axis=0) + + def _getitem_multilevel(self, key): + # self.columns is a MultiIndex + loc = self.columns.get_loc(key) + if isinstance(loc, (slice, np.ndarray)): + new_columns = self.columns[loc] + result_columns = maybe_droplevels(new_columns, key) + result = self.iloc[:, loc] + result.columns = result_columns + + # If there is only one column being returned, and its name is + # either an empty string, or a tuple with an empty string as its + # first element, then treat the empty string as a placeholder + # and return the column as if the user had provided that empty + # string in the key. If the result is a Series, exclude the + # implied empty string from its name. + if len(result.columns) == 1: + # e.g. test_frame_getitem_multicolumn_empty_level, + # test_frame_mixed_depth_get, test_loc_setitem_single_column_slice + top = result.columns[0] + if isinstance(top, tuple): + top = top[0] + if top == "": + result = result[""] + if isinstance(result, Series): + result = self._constructor_sliced( + result, index=self.index, name=key + ) + + result._set_is_copy(self) + return result + else: + # loc is neither a slice nor ndarray, so must be an int + return self._ixs(loc, axis=1) + + def _get_value(self, index, col, takeable: bool = False) -> Scalar: + """ + Quickly retrieve single value at passed column and index. + + Parameters + ---------- + index : row label + col : column label + takeable : interpret the index/col as indexers, default False + + Returns + ------- + scalar + + Notes + ----- + Assumes that both `self.index._index_as_unique` and + `self.columns._index_as_unique`; Caller is responsible for checking. + """ + if takeable: + series = self._ixs(col, axis=1) + return series._values[index] + + series = self._get_item_cache(col) + engine = self.index._engine + + if not isinstance(self.index, MultiIndex): + # CategoricalIndex: Trying to use the engine fastpath may give incorrect + # results if our categories are integers that dont match our codes + # IntervalIndex: IntervalTree has no get_loc + row = self.index.get_loc(index) + return series._values[row] + + # For MultiIndex going through engine effectively restricts us to + # same-length tuples; see test_get_set_value_no_partial_indexing + loc = engine.get_loc(index) + return series._values[loc] + + def isetitem(self, loc, value) -> None: + """ + Set the given value in the column with position `loc`. + + This is a positional analogue to ``__setitem__``. + + Parameters + ---------- + loc : int or sequence of ints + Index position for the column. + value : scalar or arraylike + Value(s) for the column. + + Notes + ----- + ``frame.isetitem(loc, value)`` is an in-place method as it will + modify the DataFrame in place (not returning a new object). In contrast to + ``frame.iloc[:, i] = value`` which will try to update the existing values in + place, ``frame.isetitem(loc, value)`` will not update the values of the column + itself in place, it will instead insert a new array. + + In cases where ``frame.columns`` is unique, this is equivalent to + ``frame[frame.columns[i]] = value``. + """ + if isinstance(value, DataFrame): + if is_integer(loc): + loc = [loc] + + if len(loc) != len(value.columns): + raise ValueError( + f"Got {len(loc)} positions but value has {len(value.columns)} " + f"columns." + ) + + for i, idx in enumerate(loc): + arraylike, refs = self._sanitize_column(value.iloc[:, i]) + self._iset_item_mgr(idx, arraylike, inplace=False, refs=refs) + return + + arraylike, refs = self._sanitize_column(value) + self._iset_item_mgr(loc, arraylike, inplace=False, refs=refs) + + def __setitem__(self, key, value) -> None: + if not PYPY and using_copy_on_write(): + if sys.getrefcount(self) <= 3: + warnings.warn( + _chained_assignment_msg, ChainedAssignmentError, stacklevel=2 + ) + elif not PYPY and not using_copy_on_write(): + if sys.getrefcount(self) <= 3 and ( + warn_copy_on_write() + or ( + not warn_copy_on_write() + and any(b.refs.has_reference() for b in self._mgr.blocks) # type: ignore[union-attr] + ) + ): + warnings.warn( + _chained_assignment_warning_msg, FutureWarning, stacklevel=2 + ) + + key = com.apply_if_callable(key, self) + + # see if we can slice the rows + if isinstance(key, slice): + slc = self.index._convert_slice_indexer(key, kind="getitem") + return self._setitem_slice(slc, value) + + if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2: + self._setitem_frame(key, value) + elif isinstance(key, (Series, np.ndarray, list, Index)): + self._setitem_array(key, value) + elif isinstance(value, DataFrame): + self._set_item_frame_value(key, value) + elif ( + is_list_like(value) + and not self.columns.is_unique + and 1 < len(self.columns.get_indexer_for([key])) == len(value) + ): + # Column to set is duplicated + self._setitem_array([key], value) + else: + # set column + self._set_item(key, value) + + def _setitem_slice(self, key: slice, value) -> None: + # NB: we can't just use self.loc[key] = value because that + # operates on labels and we need to operate positional for + # backwards-compat, xref GH#31469 + self._check_setitem_copy() + self.iloc[key] = value + + def _setitem_array(self, key, value): + # also raises Exception if object array with NA values + if com.is_bool_indexer(key): + # bool indexer is indexing along rows + if len(key) != len(self.index): + raise ValueError( + f"Item wrong length {len(key)} instead of {len(self.index)}!" + ) + key = check_bool_indexer(self.index, key) + indexer = key.nonzero()[0] + self._check_setitem_copy() + if isinstance(value, DataFrame): + # GH#39931 reindex since iloc does not align + value = value.reindex(self.index.take(indexer)) + self.iloc[indexer] = value + + else: + # Note: unlike self.iloc[:, indexer] = value, this will + # never try to overwrite values inplace + + if isinstance(value, DataFrame): + check_key_length(self.columns, key, value) + for k1, k2 in zip(key, value.columns): + self[k1] = value[k2] + + elif not is_list_like(value): + for col in key: + self[col] = value + + elif isinstance(value, np.ndarray) and value.ndim == 2: + self._iset_not_inplace(key, value) + + elif np.ndim(value) > 1: + # list of lists + value = DataFrame(value).values + return self._setitem_array(key, value) + + else: + self._iset_not_inplace(key, value) + + def _iset_not_inplace(self, key, value): + # GH#39510 when setting with df[key] = obj with a list-like key and + # list-like value, we iterate over those listlikes and set columns + # one at a time. This is different from dispatching to + # `self.loc[:, key]= value` because loc.__setitem__ may overwrite + # data inplace, whereas this will insert new arrays. + + def igetitem(obj, i: int): + # Note: we catch DataFrame obj before getting here, but + # hypothetically would return obj.iloc[:, i] + if isinstance(obj, np.ndarray): + return obj[..., i] + else: + return obj[i] + + if self.columns.is_unique: + if np.shape(value)[-1] != len(key): + raise ValueError("Columns must be same length as key") + + for i, col in enumerate(key): + self[col] = igetitem(value, i) + + else: + ilocs = self.columns.get_indexer_non_unique(key)[0] + if (ilocs < 0).any(): + # key entries not in self.columns + raise NotImplementedError + + if np.shape(value)[-1] != len(ilocs): + raise ValueError("Columns must be same length as key") + + assert np.ndim(value) <= 2 + + orig_columns = self.columns + + # Using self.iloc[:, i] = ... may set values inplace, which + # by convention we do not do in __setitem__ + try: + self.columns = Index(range(len(self.columns))) + for i, iloc in enumerate(ilocs): + self[iloc] = igetitem(value, i) + finally: + self.columns = orig_columns + + def _setitem_frame(self, key, value): + # support boolean setting with DataFrame input, e.g. + # df[df > df2] = 0 + if isinstance(key, np.ndarray): + if key.shape != self.shape: + raise ValueError("Array conditional must be same shape as self") + key = self._constructor(key, **self._construct_axes_dict(), copy=False) + + if key.size and not all(is_bool_dtype(dtype) for dtype in key.dtypes): + raise TypeError( + "Must pass DataFrame or 2-d ndarray with boolean values only" + ) + + self._check_setitem_copy() + self._where(-key, value, inplace=True) + + def _set_item_frame_value(self, key, value: DataFrame) -> None: + self._ensure_valid_index(value) + + # align columns + if key in self.columns: + loc = self.columns.get_loc(key) + cols = self.columns[loc] + len_cols = 1 if is_scalar(cols) or isinstance(cols, tuple) else len(cols) + if len_cols != len(value.columns): + raise ValueError("Columns must be same length as key") + + # align right-hand-side columns if self.columns + # is multi-index and self[key] is a sub-frame + if isinstance(self.columns, MultiIndex) and isinstance( + loc, (slice, Series, np.ndarray, Index) + ): + cols_droplevel = maybe_droplevels(cols, key) + if len(cols_droplevel) and not cols_droplevel.equals(value.columns): + value = value.reindex(cols_droplevel, axis=1) + + for col, col_droplevel in zip(cols, cols_droplevel): + self[col] = value[col_droplevel] + return + + if is_scalar(cols): + self[cols] = value[value.columns[0]] + return + + locs: np.ndarray | list + if isinstance(loc, slice): + locs = np.arange(loc.start, loc.stop, loc.step) + elif is_scalar(loc): + locs = [loc] + else: + locs = loc.nonzero()[0] + + return self.isetitem(locs, value) + + if len(value.columns) > 1: + raise ValueError( + "Cannot set a DataFrame with multiple columns to the single " + f"column {key}" + ) + elif len(value.columns) == 0: + raise ValueError( + f"Cannot set a DataFrame without columns to the column {key}" + ) + + self[key] = value[value.columns[0]] + + def _iset_item_mgr( + self, + loc: int | slice | np.ndarray, + value, + inplace: bool = False, + refs: BlockValuesRefs | None = None, + ) -> None: + # when called from _set_item_mgr loc can be anything returned from get_loc + self._mgr.iset(loc, value, inplace=inplace, refs=refs) + self._clear_item_cache() + + def _set_item_mgr( + self, key, value: ArrayLike, refs: BlockValuesRefs | None = None + ) -> None: + try: + loc = self._info_axis.get_loc(key) + except KeyError: + # This item wasn't present, just insert at end + self._mgr.insert(len(self._info_axis), key, value, refs) + else: + self._iset_item_mgr(loc, value, refs=refs) + + # check if we are modifying a copy + # try to set first as we want an invalid + # value exception to occur first + if len(self): + self._check_setitem_copy() + + def _iset_item(self, loc: int, value: Series, inplace: bool = True) -> None: + # We are only called from _replace_columnwise which guarantees that + # no reindex is necessary + if using_copy_on_write(): + self._iset_item_mgr( + loc, value._values, inplace=inplace, refs=value._references + ) + else: + self._iset_item_mgr(loc, value._values.copy(), inplace=True) + + # check if we are modifying a copy + # try to set first as we want an invalid + # value exception to occur first + if len(self): + self._check_setitem_copy() + + def _set_item(self, key, value) -> None: + """ + Add series to DataFrame in specified column. + + If series is a numpy-array (not a Series/TimeSeries), it must be the + same length as the DataFrames index or an error will be thrown. + + Series/TimeSeries will be conformed to the DataFrames index to + ensure homogeneity. + """ + value, refs = self._sanitize_column(value) + + if ( + key in self.columns + and value.ndim == 1 + and not isinstance(value.dtype, ExtensionDtype) + ): + # broadcast across multiple columns if necessary + if not self.columns.is_unique or isinstance(self.columns, MultiIndex): + existing_piece = self[key] + if isinstance(existing_piece, DataFrame): + value = np.tile(value, (len(existing_piece.columns), 1)).T + refs = None + + self._set_item_mgr(key, value, refs) + + def _set_value( + self, index: IndexLabel, col, value: Scalar, takeable: bool = False + ) -> None: + """ + Put single value at passed column and index. + + Parameters + ---------- + index : Label + row label + col : Label + column label + value : scalar + takeable : bool, default False + Sets whether or not index/col interpreted as indexers + """ + try: + if takeable: + icol = col + iindex = cast(int, index) + else: + icol = self.columns.get_loc(col) + iindex = self.index.get_loc(index) + self._mgr.column_setitem(icol, iindex, value, inplace_only=True) + self._clear_item_cache() + + except (KeyError, TypeError, ValueError, LossySetitemError): + # get_loc might raise a KeyError for missing labels (falling back + # to (i)loc will do expansion of the index) + # column_setitem will do validation that may raise TypeError, + # ValueError, or LossySetitemError + # set using a non-recursive method & reset the cache + if takeable: + self.iloc[index, col] = value + else: + self.loc[index, col] = value + self._item_cache.pop(col, None) + + except InvalidIndexError as ii_err: + # GH48729: Seems like you are trying to assign a value to a + # row when only scalar options are permitted + raise InvalidIndexError( + f"You can only assign a scalar value not a {type(value)}" + ) from ii_err + + def _ensure_valid_index(self, value) -> None: + """ + Ensure that if we don't have an index, that we can create one from the + passed value. + """ + # GH5632, make sure that we are a Series convertible + if not len(self.index) and is_list_like(value) and len(value): + if not isinstance(value, DataFrame): + try: + value = Series(value) + except (ValueError, NotImplementedError, TypeError) as err: + raise ValueError( + "Cannot set a frame with no defined index " + "and a value that cannot be converted to a Series" + ) from err + + # GH31368 preserve name of index + index_copy = value.index.copy() + if self.index.name is not None: + index_copy.name = self.index.name + + self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan) + + def _box_col_values(self, values: SingleDataManager, loc: int) -> Series: + """ + Provide boxed values for a column. + """ + # Lookup in columns so that if e.g. a str datetime was passed + # we attach the Timestamp object as the name. + name = self.columns[loc] + # We get index=self.index bc values is a SingleDataManager + obj = self._constructor_sliced_from_mgr(values, axes=values.axes) + obj._name = name + return obj.__finalize__(self) + + # ---------------------------------------------------------------------- + # Lookup Caching + + def _clear_item_cache(self) -> None: + self._item_cache.clear() + + def _get_item_cache(self, item: Hashable) -> Series: + """Return the cached item, item represents a label indexer.""" + if using_copy_on_write() or warn_copy_on_write(): + loc = self.columns.get_loc(item) + return self._ixs(loc, axis=1) + + cache = self._item_cache + res = cache.get(item) + if res is None: + # All places that call _get_item_cache have unique columns, + # pending resolution of GH#33047 + + loc = self.columns.get_loc(item) + res = self._ixs(loc, axis=1) + + cache[item] = res + + # for a chain + res._is_copy = self._is_copy + return res + + def _reset_cacher(self) -> None: + # no-op for DataFrame + pass + + def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None: + """ + The object has called back to us saying maybe it has changed. + """ + loc = self._info_axis.get_loc(item) + arraylike = value._values + + old = self._ixs(loc, axis=1) + if old._values is value._values and inplace: + # GH#46149 avoid making unnecessary copies/block-splitting + return + + self._mgr.iset(loc, arraylike, inplace=inplace) + + # ---------------------------------------------------------------------- + # Unsorted + + @overload + def query(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> DataFrame: + ... + + @overload + def query(self, expr: str, *, inplace: Literal[True], **kwargs) -> None: + ... + + @overload + def query(self, expr: str, *, inplace: bool = ..., **kwargs) -> DataFrame | None: + ... + + def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | None: + """ + Query the columns of a DataFrame with a boolean expression. + + Parameters + ---------- + expr : str + The query string to evaluate. + + You can refer to variables + in the environment by prefixing them with an '@' character like + ``@a + b``. + + You can refer to column names that are not valid Python variable names + by surrounding them in backticks. Thus, column names containing spaces + or punctuations (besides underscores) or starting with digits must be + surrounded by backticks. (For example, a column named "Area (cm^2)" would + be referenced as ```Area (cm^2)```). Column names which are Python keywords + (like "list", "for", "import", etc) cannot be used. + + For example, if one of your columns is called ``a a`` and you want + to sum it with ``b``, your query should be ```a a` + b``. + + inplace : bool + Whether to modify the DataFrame rather than creating a new one. + **kwargs + See the documentation for :func:`eval` for complete details + on the keyword arguments accepted by :meth:`DataFrame.query`. + + Returns + ------- + DataFrame or None + DataFrame resulting from the provided query expression or + None if ``inplace=True``. + + See Also + -------- + eval : Evaluate a string describing operations on + DataFrame columns. + DataFrame.eval : Evaluate a string describing operations on + DataFrame columns. + + Notes + ----- + The result of the evaluation of this expression is first passed to + :attr:`DataFrame.loc` and if that fails because of a + multidimensional key (e.g., a DataFrame) then the result will be passed + to :meth:`DataFrame.__getitem__`. + + This method uses the top-level :func:`eval` function to + evaluate the passed query. + + The :meth:`~pandas.DataFrame.query` method uses a slightly + modified Python syntax by default. For example, the ``&`` and ``|`` + (bitwise) operators have the precedence of their boolean cousins, + :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python, + however the semantics are different. + + You can change the semantics of the expression by passing the keyword + argument ``parser='python'``. This enforces the same semantics as + evaluation in Python space. Likewise, you can pass ``engine='python'`` + to evaluate an expression using Python itself as a backend. This is not + recommended as it is inefficient compared to using ``numexpr`` as the + engine. + + The :attr:`DataFrame.index` and + :attr:`DataFrame.columns` attributes of the + :class:`~pandas.DataFrame` instance are placed in the query namespace + by default, which allows you to treat both the index and columns of the + frame as a column in the frame. + The identifier ``index`` is used for the frame index; you can also + use the name of the index to identify it in a query. Please note that + Python keywords may not be used as identifiers. + + For further details and examples see the ``query`` documentation in + :ref:`indexing `. + + *Backtick quoted variables* + + Backtick quoted variables are parsed as literal Python code and + are converted internally to a Python valid identifier. + This can lead to the following problems. + + During parsing a number of disallowed characters inside the backtick + quoted string are replaced by strings that are allowed as a Python identifier. + These characters include all operators in Python, the space character, the + question mark, the exclamation mark, the dollar sign, and the euro sign. + For other characters that fall outside the ASCII range (U+0001..U+007F) + and those that are not further specified in PEP 3131, + the query parser will raise an error. + This excludes whitespace different than the space character, + but also the hashtag (as it is used for comments) and the backtick + itself (backtick can also not be escaped). + + In a special case, quotes that make a pair around a backtick can + confuse the parser. + For example, ```it's` > `that's``` will raise an error, + as it forms a quoted string (``'s > `that'``) with a backtick inside. + + See also the Python documentation about lexical analysis + (https://docs.python.org/3/reference/lexical_analysis.html) + in combination with the source code in :mod:`pandas.core.computation.parsing`. + + Examples + -------- + >>> df = pd.DataFrame({'A': range(1, 6), + ... 'B': range(10, 0, -2), + ... 'C C': range(10, 5, -1)}) + >>> df + A B C C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 + >>> df.query('A > B') + A B C C + 4 5 2 6 + + The previous expression is equivalent to + + >>> df[df.A > df.B] + A B C C + 4 5 2 6 + + For columns with spaces in their name, you can use backtick quoting. + + >>> df.query('B == `C C`') + A B C C + 0 1 10 10 + + The previous expression is equivalent to + + >>> df[df.B == df['C C']] + A B C C + 0 1 10 10 + """ + inplace = validate_bool_kwarg(inplace, "inplace") + if not isinstance(expr, str): + msg = f"expr must be a string to be evaluated, {type(expr)} given" + raise ValueError(msg) + kwargs["level"] = kwargs.pop("level", 0) + 1 + kwargs["target"] = None + res = self.eval(expr, **kwargs) + + try: + result = self.loc[res] + except ValueError: + # when res is multi-dimensional loc raises, but this is sometimes a + # valid query + result = self[res] + + if inplace: + self._update_inplace(result) + return None + else: + return result + + @overload + def eval(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> Any: + ... + + @overload + def eval(self, expr: str, *, inplace: Literal[True], **kwargs) -> None: + ... + + def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: + """ + Evaluate a string describing operations on DataFrame columns. + + Operates on columns only, not specific rows or elements. This allows + `eval` to run arbitrary code, which can make you vulnerable to code + injection if you pass user input to this function. + + Parameters + ---------- + expr : str + The expression string to evaluate. + inplace : bool, default False + If the expression contains an assignment, whether to perform the + operation inplace and mutate the existing DataFrame. Otherwise, + a new DataFrame is returned. + **kwargs + See the documentation for :func:`eval` for complete details + on the keyword arguments accepted by + :meth:`~pandas.DataFrame.query`. + + Returns + ------- + ndarray, scalar, pandas object, or None + The result of the evaluation or None if ``inplace=True``. + + See Also + -------- + DataFrame.query : Evaluates a boolean expression to query the columns + of a frame. + DataFrame.assign : Can evaluate an expression or function to create new + values for a column. + eval : Evaluate a Python expression as a string using various + backends. + + Notes + ----- + For more details see the API documentation for :func:`~eval`. + For detailed examples see :ref:`enhancing performance with eval + `. + + Examples + -------- + >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + >>> df.eval('A + B') + 0 11 + 1 10 + 2 9 + 3 8 + 4 7 + dtype: int64 + + Assignment is allowed though by default the original DataFrame is not + modified. + + >>> df.eval('C = A + B') + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + + Multiple columns can be assigned to using multi-line expressions: + + >>> df.eval( + ... ''' + ... C = A + B + ... D = A - B + ... ''' + ... ) + A B C D + 0 1 10 11 -9 + 1 2 8 10 -6 + 2 3 6 9 -3 + 3 4 4 8 0 + 4 5 2 7 3 + """ + from pandas.core.computation.eval import eval as _eval + + inplace = validate_bool_kwarg(inplace, "inplace") + kwargs["level"] = kwargs.pop("level", 0) + 1 + index_resolvers = self._get_index_resolvers() + column_resolvers = self._get_cleaned_column_resolvers() + resolvers = column_resolvers, index_resolvers + if "target" not in kwargs: + kwargs["target"] = self + kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers + + return _eval(expr, inplace=inplace, **kwargs) + + def select_dtypes(self, include=None, exclude=None) -> Self: + """ + Return a subset of the DataFrame's columns based on the column dtypes. + + Parameters + ---------- + include, exclude : scalar or list-like + A selection of dtypes or strings to be included/excluded. At least + one of these parameters must be supplied. + + Returns + ------- + DataFrame + The subset of the frame including the dtypes in ``include`` and + excluding the dtypes in ``exclude``. + + Raises + ------ + ValueError + * If both of ``include`` and ``exclude`` are empty + * If ``include`` and ``exclude`` have overlapping elements + * If any kind of string dtype is passed in. + + See Also + -------- + DataFrame.dtypes: Return Series with the data type of each column. + + Notes + ----- + * To select all *numeric* types, use ``np.number`` or ``'number'`` + * To select strings you must use the ``object`` dtype, but note that + this will return *all* object dtype columns + * See the `numpy dtype hierarchy + `__ + * To select datetimes, use ``np.datetime64``, ``'datetime'`` or + ``'datetime64'`` + * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or + ``'timedelta64'`` + * To select Pandas categorical dtypes, use ``'category'`` + * To select Pandas datetimetz dtypes, use ``'datetimetz'`` + or ``'datetime64[ns, tz]'`` + + Examples + -------- + >>> df = pd.DataFrame({'a': [1, 2] * 3, + ... 'b': [True, False] * 3, + ... 'c': [1.0, 2.0] * 3}) + >>> df + a b c + 0 1 True 1.0 + 1 2 False 2.0 + 2 1 True 1.0 + 3 2 False 2.0 + 4 1 True 1.0 + 5 2 False 2.0 + + >>> df.select_dtypes(include='bool') + b + 0 True + 1 False + 2 True + 3 False + 4 True + 5 False + + >>> df.select_dtypes(include=['float64']) + c + 0 1.0 + 1 2.0 + 2 1.0 + 3 2.0 + 4 1.0 + 5 2.0 + + >>> df.select_dtypes(exclude=['int64']) + b c + 0 True 1.0 + 1 False 2.0 + 2 True 1.0 + 3 False 2.0 + 4 True 1.0 + 5 False 2.0 + """ + if not is_list_like(include): + include = (include,) if include is not None else () + if not is_list_like(exclude): + exclude = (exclude,) if exclude is not None else () + + selection = (frozenset(include), frozenset(exclude)) + + if not any(selection): + raise ValueError("at least one of include or exclude must be nonempty") + + # convert the myriad valid dtypes object to a single representation + def check_int_infer_dtype(dtypes): + converted_dtypes: list[type] = [] + for dtype in dtypes: + # Numpy maps int to different types (int32, in64) on Windows and Linux + # see https://github.com/numpy/numpy/issues/9464 + if (isinstance(dtype, str) and dtype == "int") or (dtype is int): + converted_dtypes.append(np.int32) + converted_dtypes.append(np.int64) + elif dtype == "float" or dtype is float: + # GH#42452 : np.dtype("float") coerces to np.float64 from Numpy 1.20 + converted_dtypes.extend([np.float64, np.float32]) + else: + converted_dtypes.append(infer_dtype_from_object(dtype)) + return frozenset(converted_dtypes) + + include = check_int_infer_dtype(include) + exclude = check_int_infer_dtype(exclude) + + for dtypes in (include, exclude): + invalidate_string_dtypes(dtypes) + + # can't both include AND exclude! + if not include.isdisjoint(exclude): + raise ValueError(f"include and exclude overlap on {(include & exclude)}") + + def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool: + # GH 46870: BooleanDtype._is_numeric == True but should be excluded + dtype = dtype if not isinstance(dtype, ArrowDtype) else dtype.numpy_dtype + return issubclass(dtype.type, tuple(dtypes_set)) or ( + np.number in dtypes_set + and getattr(dtype, "_is_numeric", False) + and not is_bool_dtype(dtype) + ) + + def predicate(arr: ArrayLike) -> bool: + dtype = arr.dtype + if include: + if not dtype_predicate(dtype, include): + return False + + if exclude: + if dtype_predicate(dtype, exclude): + return False + + return True + + mgr = self._mgr._get_data_subset(predicate).copy(deep=None) + # error: Incompatible return value type (got "DataFrame", expected "Self") + return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self) # type: ignore[return-value] + + def insert( + self, + loc: int, + column: Hashable, + value: Scalar | AnyArrayLike, + allow_duplicates: bool | lib.NoDefault = lib.no_default, + ) -> None: + """ + Insert column into DataFrame at specified location. + + Raises a ValueError if `column` is already contained in the DataFrame, + unless `allow_duplicates` is set to True. + + Parameters + ---------- + loc : int + Insertion index. Must verify 0 <= loc <= len(columns). + column : str, number, or hashable object + Label of the inserted column. + value : Scalar, Series, or array-like + Content of the inserted column. + allow_duplicates : bool, optional, default lib.no_default + Allow duplicate column labels to be created. + + See Also + -------- + Index.insert : Insert new item by index. + + Examples + -------- + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df + col1 col2 + 0 1 3 + 1 2 4 + >>> df.insert(1, "newcol", [99, 99]) + >>> df + col1 newcol col2 + 0 1 99 3 + 1 2 99 4 + >>> df.insert(0, "col1", [100, 100], allow_duplicates=True) + >>> df + col1 col1 newcol col2 + 0 100 1 99 3 + 1 100 2 99 4 + + Notice that pandas uses index alignment in case of `value` from type `Series`: + + >>> df.insert(0, "col0", pd.Series([5, 6], index=[1, 2])) + >>> df + col0 col1 col1 newcol col2 + 0 NaN 100 1 99 3 + 1 5.0 100 2 99 4 + """ + if allow_duplicates is lib.no_default: + allow_duplicates = False + if allow_duplicates and not self.flags.allows_duplicate_labels: + raise ValueError( + "Cannot specify 'allow_duplicates=True' when " + "'self.flags.allows_duplicate_labels' is False." + ) + if not allow_duplicates and column in self.columns: + # Should this be a different kind of error?? + raise ValueError(f"cannot insert {column}, already exists") + if not is_integer(loc): + raise TypeError("loc must be int") + # convert non stdlib ints to satisfy typing checks + loc = int(loc) + if isinstance(value, DataFrame) and len(value.columns) > 1: + raise ValueError( + f"Expected a one-dimensional object, got a DataFrame with " + f"{len(value.columns)} columns instead." + ) + elif isinstance(value, DataFrame): + value = value.iloc[:, 0] + + value, refs = self._sanitize_column(value) + self._mgr.insert(loc, column, value, refs=refs) + + def assign(self, **kwargs) -> DataFrame: + r""" + Assign new columns to a DataFrame. + + Returns a new object with all original columns in addition to new ones. + Existing columns that are re-assigned will be overwritten. + + Parameters + ---------- + **kwargs : dict of {str: callable or Series} + The column names are keywords. If the values are + callable, they are computed on the DataFrame and + assigned to the new columns. The callable must not + change input DataFrame (though pandas doesn't check it). + If the values are not callable, (e.g. a Series, scalar, or array), + they are simply assigned. + + Returns + ------- + DataFrame + A new DataFrame with the new columns in addition to + all the existing columns. + + Notes + ----- + Assigning multiple columns within the same ``assign`` is possible. + Later items in '\*\*kwargs' may refer to newly created or modified + columns in 'df'; items are computed and assigned into 'df' in order. + + Examples + -------- + >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]}, + ... index=['Portland', 'Berkeley']) + >>> df + temp_c + Portland 17.0 + Berkeley 25.0 + + Where the value is a callable, evaluated on `df`: + + >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32) + temp_c temp_f + Portland 17.0 62.6 + Berkeley 25.0 77.0 + + Alternatively, the same behavior can be achieved by directly + referencing an existing Series or sequence: + + >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32) + temp_c temp_f + Portland 17.0 62.6 + Berkeley 25.0 77.0 + + You can create multiple columns within the same assign where one + of the columns depends on another one defined within the same assign: + + >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32, + ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9) + temp_c temp_f temp_k + Portland 17.0 62.6 290.15 + Berkeley 25.0 77.0 298.15 + """ + data = self.copy(deep=None) + + for k, v in kwargs.items(): + data[k] = com.apply_if_callable(v, data) + return data + + def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]: + """ + Ensures new columns (which go into the BlockManager as new blocks) are + always copied (or a reference is being tracked to them under CoW) + and converted into an array. + + Parameters + ---------- + value : scalar, Series, or array-like + + Returns + ------- + tuple of numpy.ndarray or ExtensionArray and optional BlockValuesRefs + """ + self._ensure_valid_index(value) + + # Using a DataFrame would mean coercing values to one dtype + assert not isinstance(value, DataFrame) + if is_dict_like(value): + if not isinstance(value, Series): + value = Series(value) + return _reindex_for_setitem(value, self.index) + + if is_list_like(value): + com.require_length_match(value, self.index) + arr = sanitize_array(value, self.index, copy=True, allow_2d=True) + if ( + isinstance(value, Index) + and value.dtype == "object" + and arr.dtype != value.dtype + ): # + # TODO: Remove kludge in sanitize_array for string mode when enforcing + # this deprecation + warnings.warn( + "Setting an Index with object dtype into a DataFrame will stop " + "inferring another dtype in a future version. Cast the Index " + "explicitly before setting it into the DataFrame.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return arr, None + + @property + def _series(self): + return {item: self._ixs(idx, axis=1) for idx, item in enumerate(self.columns)} + + # ---------------------------------------------------------------------- + # Reindexing and alignment + + def _reindex_multi( + self, axes: dict[str, Index], copy: bool, fill_value + ) -> DataFrame: + """ + We are guaranteed non-Nones in the axes. + """ + + new_index, row_indexer = self.index.reindex(axes["index"]) + new_columns, col_indexer = self.columns.reindex(axes["columns"]) + + if row_indexer is not None and col_indexer is not None: + # Fastpath. By doing two 'take's at once we avoid making an + # unnecessary copy. + # We only get here with `self._can_fast_transpose`, which (almost) + # ensures that self.values is cheap. It may be worth making this + # condition more specific. + indexer = row_indexer, col_indexer + new_values = take_2d_multi(self.values, indexer, fill_value=fill_value) + return self._constructor( + new_values, index=new_index, columns=new_columns, copy=False + ) + else: + return self._reindex_with_indexers( + {0: [new_index, row_indexer], 1: [new_columns, col_indexer]}, + copy=copy, + fill_value=fill_value, + ) + + @Appender( + """ + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + + Change the row labels. + + >>> df.set_axis(['a', 'b', 'c'], axis='index') + A B + a 1 4 + b 2 5 + c 3 6 + + Change the column labels. + + >>> df.set_axis(['I', 'II'], axis='columns') + I II + 0 1 4 + 1 2 5 + 2 3 6 + """ + ) + @Substitution( + klass=_shared_doc_kwargs["klass"], + axes_single_arg=_shared_doc_kwargs["axes_single_arg"], + extended_summary_sub=" column or", + axis_description_sub=", and 1 identifies the columns", + see_also_sub=" or columns", + ) + @Appender(NDFrame.set_axis.__doc__) + def set_axis( + self, + labels, + *, + axis: Axis = 0, + copy: bool | None = None, + ) -> DataFrame: + return super().set_axis(labels, axis=axis, copy=copy) + + @doc( + NDFrame.reindex, + klass=_shared_doc_kwargs["klass"], + optional_reindex=_shared_doc_kwargs["optional_reindex"], + ) + def reindex( + self, + labels=None, + *, + index=None, + columns=None, + axis: Axis | None = None, + method: ReindexMethod | None = None, + copy: bool | None = None, + level: Level | None = None, + fill_value: Scalar | None = np.nan, + limit: int | None = None, + tolerance=None, + ) -> DataFrame: + return super().reindex( + labels=labels, + index=index, + columns=columns, + axis=axis, + method=method, + copy=copy, + level=level, + fill_value=fill_value, + limit=limit, + tolerance=tolerance, + ) + + @overload + def drop( + self, + labels: IndexLabel = ..., + *, + axis: Axis = ..., + index: IndexLabel = ..., + columns: IndexLabel = ..., + level: Level = ..., + inplace: Literal[True], + errors: IgnoreRaise = ..., + ) -> None: + ... + + @overload + def drop( + self, + labels: IndexLabel = ..., + *, + axis: Axis = ..., + index: IndexLabel = ..., + columns: IndexLabel = ..., + level: Level = ..., + inplace: Literal[False] = ..., + errors: IgnoreRaise = ..., + ) -> DataFrame: + ... + + @overload + def drop( + self, + labels: IndexLabel = ..., + *, + axis: Axis = ..., + index: IndexLabel = ..., + columns: IndexLabel = ..., + level: Level = ..., + inplace: bool = ..., + errors: IgnoreRaise = ..., + ) -> DataFrame | None: + ... + + def drop( + self, + labels: IndexLabel | None = None, + *, + axis: Axis = 0, + index: IndexLabel | None = None, + columns: IndexLabel | None = None, + level: Level | None = None, + inplace: bool = False, + errors: IgnoreRaise = "raise", + ) -> DataFrame | None: + """ + Drop specified labels from rows or columns. + + Remove rows or columns by specifying label names and corresponding + axis, or by directly specifying index or column names. When using a + multi-index, labels on different levels can be removed by specifying + the level. See the :ref:`user guide ` + for more information about the now unused levels. + + Parameters + ---------- + labels : single label or list-like + Index or column labels to drop. A tuple will be used as a single + label and not treated as a list-like. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Whether to drop labels from the index (0 or 'index') or + columns (1 or 'columns'). + index : single label or list-like + Alternative to specifying axis (``labels, axis=0`` + is equivalent to ``index=labels``). + columns : single label or list-like + Alternative to specifying axis (``labels, axis=1`` + is equivalent to ``columns=labels``). + level : int or level name, optional + For MultiIndex, level from which the labels will be removed. + inplace : bool, default False + If False, return a copy. Otherwise, do operation + in place and return None. + errors : {'ignore', 'raise'}, default 'raise' + If 'ignore', suppress error and only existing labels are + dropped. + + Returns + ------- + DataFrame or None + Returns DataFrame or None DataFrame with the specified + index or column labels removed or None if inplace=True. + + Raises + ------ + KeyError + If any of the labels is not found in the selected axis. + + See Also + -------- + DataFrame.loc : Label-location based indexer for selection by label. + DataFrame.dropna : Return DataFrame with labels on given axis omitted + where (all or any) data are missing. + DataFrame.drop_duplicates : Return DataFrame with duplicate rows + removed, optionally only considering certain columns. + Series.drop : Return Series with specified index labels removed. + + Examples + -------- + >>> df = pd.DataFrame(np.arange(12).reshape(3, 4), + ... columns=['A', 'B', 'C', 'D']) + >>> df + A B C D + 0 0 1 2 3 + 1 4 5 6 7 + 2 8 9 10 11 + + Drop columns + + >>> df.drop(['B', 'C'], axis=1) + A D + 0 0 3 + 1 4 7 + 2 8 11 + + >>> df.drop(columns=['B', 'C']) + A D + 0 0 3 + 1 4 7 + 2 8 11 + + Drop a row by index + + >>> df.drop([0, 1]) + A B C D + 2 8 9 10 11 + + Drop columns and/or rows of MultiIndex DataFrame + + >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'], + ... ['speed', 'weight', 'length']], + ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], + ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + >>> df = pd.DataFrame(index=midx, columns=['big', 'small'], + ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20], + ... [250, 150], [1.5, 0.8], [320, 250], + ... [1, 0.8], [0.3, 0.2]]) + >>> df + big small + llama speed 45.0 30.0 + weight 200.0 100.0 + length 1.5 1.0 + cow speed 30.0 20.0 + weight 250.0 150.0 + length 1.5 0.8 + falcon speed 320.0 250.0 + weight 1.0 0.8 + length 0.3 0.2 + + Drop a specific index combination from the MultiIndex + DataFrame, i.e., drop the combination ``'falcon'`` and + ``'weight'``, which deletes only the corresponding row + + >>> df.drop(index=('falcon', 'weight')) + big small + llama speed 45.0 30.0 + weight 200.0 100.0 + length 1.5 1.0 + cow speed 30.0 20.0 + weight 250.0 150.0 + length 1.5 0.8 + falcon speed 320.0 250.0 + length 0.3 0.2 + + >>> df.drop(index='cow', columns='small') + big + llama speed 45.0 + weight 200.0 + length 1.5 + falcon speed 320.0 + weight 1.0 + length 0.3 + + >>> df.drop(index='length', level=1) + big small + llama speed 45.0 30.0 + weight 200.0 100.0 + cow speed 30.0 20.0 + weight 250.0 150.0 + falcon speed 320.0 250.0 + weight 1.0 0.8 + """ + return super().drop( + labels=labels, + axis=axis, + index=index, + columns=columns, + level=level, + inplace=inplace, + errors=errors, + ) + + @overload + def rename( + self, + mapper: Renamer | None = ..., + *, + index: Renamer | None = ..., + columns: Renamer | None = ..., + axis: Axis | None = ..., + copy: bool | None = ..., + inplace: Literal[True], + level: Level = ..., + errors: IgnoreRaise = ..., + ) -> None: + ... + + @overload + def rename( + self, + mapper: Renamer | None = ..., + *, + index: Renamer | None = ..., + columns: Renamer | None = ..., + axis: Axis | None = ..., + copy: bool | None = ..., + inplace: Literal[False] = ..., + level: Level = ..., + errors: IgnoreRaise = ..., + ) -> DataFrame: + ... + + @overload + def rename( + self, + mapper: Renamer | None = ..., + *, + index: Renamer | None = ..., + columns: Renamer | None = ..., + axis: Axis | None = ..., + copy: bool | None = ..., + inplace: bool = ..., + level: Level = ..., + errors: IgnoreRaise = ..., + ) -> DataFrame | None: + ... + + def rename( + self, + mapper: Renamer | None = None, + *, + index: Renamer | None = None, + columns: Renamer | None = None, + axis: Axis | None = None, + copy: bool | None = None, + inplace: bool = False, + level: Level | None = None, + errors: IgnoreRaise = "ignore", + ) -> DataFrame | None: + """ + Rename columns or index labels. + + Function / dict values must be unique (1-to-1). Labels not contained in + a dict / Series will be left as-is. Extra labels listed don't throw an + error. + + See the :ref:`user guide ` for more. + + Parameters + ---------- + mapper : dict-like or function + Dict-like or function transformations to apply to + that axis' values. Use either ``mapper`` and ``axis`` to + specify the axis to target with ``mapper``, or ``index`` and + ``columns``. + index : dict-like or function + Alternative to specifying axis (``mapper, axis=0`` + is equivalent to ``index=mapper``). + columns : dict-like or function + Alternative to specifying axis (``mapper, axis=1`` + is equivalent to ``columns=mapper``). + axis : {0 or 'index', 1 or 'columns'}, default 0 + Axis to target with ``mapper``. Can be either the axis name + ('index', 'columns') or number (0, 1). The default is 'index'. + copy : bool, default True + Also copy underlying data. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + inplace : bool, default False + Whether to modify the DataFrame rather than creating a new one. + If True then value of copy is ignored. + level : int or level name, default None + In case of a MultiIndex, only rename labels in the specified + level. + errors : {'ignore', 'raise'}, default 'ignore' + If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`, + or `columns` contains labels that are not present in the Index + being transformed. + If 'ignore', existing keys will be renamed and extra keys will be + ignored. + + Returns + ------- + DataFrame or None + DataFrame with the renamed axis labels or None if ``inplace=True``. + + Raises + ------ + KeyError + If any of the labels is not found in the selected axis and + "errors='raise'". + + See Also + -------- + DataFrame.rename_axis : Set the name of the axis. + + Examples + -------- + ``DataFrame.rename`` supports two calling conventions + + * ``(index=index_mapper, columns=columns_mapper, ...)`` + * ``(mapper, axis={'index', 'columns'}, ...)`` + + We *highly* recommend using keyword arguments to clarify your + intent. + + Rename columns using a mapping: + + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + >>> df.rename(columns={"A": "a", "B": "c"}) + a c + 0 1 4 + 1 2 5 + 2 3 6 + + Rename index using a mapping: + + >>> df.rename(index={0: "x", 1: "y", 2: "z"}) + A B + x 1 4 + y 2 5 + z 3 6 + + Cast index labels to a different type: + + >>> df.index + RangeIndex(start=0, stop=3, step=1) + >>> df.rename(index=str).index + Index(['0', '1', '2'], dtype='object') + + >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise") + Traceback (most recent call last): + KeyError: ['C'] not found in axis + + Using axis-style parameters: + + >>> df.rename(str.lower, axis='columns') + a b + 0 1 4 + 1 2 5 + 2 3 6 + + >>> df.rename({1: 2, 2: 4}, axis='index') + A B + 0 1 4 + 2 2 5 + 4 3 6 + """ + return super()._rename( + mapper=mapper, + index=index, + columns=columns, + axis=axis, + copy=copy, + inplace=inplace, + level=level, + errors=errors, + ) + + def pop(self, item: Hashable) -> Series: + """ + Return item and drop from frame. Raise KeyError if not found. + + Parameters + ---------- + item : label + Label of column to be popped. + + Returns + ------- + Series + + Examples + -------- + >>> df = pd.DataFrame([('falcon', 'bird', 389.0), + ... ('parrot', 'bird', 24.0), + ... ('lion', 'mammal', 80.5), + ... ('monkey', 'mammal', np.nan)], + ... columns=('name', 'class', 'max_speed')) + >>> df + name class max_speed + 0 falcon bird 389.0 + 1 parrot bird 24.0 + 2 lion mammal 80.5 + 3 monkey mammal NaN + + >>> df.pop('class') + 0 bird + 1 bird + 2 mammal + 3 mammal + Name: class, dtype: object + + >>> df + name max_speed + 0 falcon 389.0 + 1 parrot 24.0 + 2 lion 80.5 + 3 monkey NaN + """ + return super().pop(item=item) + + def _replace_columnwise( + self, mapping: dict[Hashable, tuple[Any, Any]], inplace: bool, regex + ): + """ + Dispatch to Series.replace column-wise. + + Parameters + ---------- + mapping : dict + of the form {col: (target, value)} + inplace : bool + regex : bool or same types as `to_replace` in DataFrame.replace + + Returns + ------- + DataFrame or None + """ + # Operate column-wise + res = self if inplace else self.copy(deep=None) + ax = self.columns + + for i, ax_value in enumerate(ax): + if ax_value in mapping: + ser = self.iloc[:, i] + + target, value = mapping[ax_value] + newobj = ser.replace(target, value, regex=regex) + + res._iset_item(i, newobj, inplace=inplace) + + if inplace: + return + return res.__finalize__(self) + + @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) + def shift( + self, + periods: int | Sequence[int] = 1, + freq: Frequency | None = None, + axis: Axis = 0, + fill_value: Hashable = lib.no_default, + suffix: str | None = None, + ) -> DataFrame: + if freq is not None and fill_value is not lib.no_default: + # GH#53832 + warnings.warn( + "Passing a 'freq' together with a 'fill_value' silently ignores " + "the fill_value and is deprecated. This will raise in a future " + "version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + fill_value = lib.no_default + + if self.empty: + return self.copy() + + axis = self._get_axis_number(axis) + + if is_list_like(periods): + periods = cast(Sequence, periods) + if axis == 1: + raise ValueError( + "If `periods` contains multiple shifts, `axis` cannot be 1." + ) + if len(periods) == 0: + raise ValueError("If `periods` is an iterable, it cannot be empty.") + from pandas.core.reshape.concat import concat + + shifted_dataframes = [] + for period in periods: + if not is_integer(period): + raise TypeError( + f"Periods must be integer, but {period} is {type(period)}." + ) + period = cast(int, period) + shifted_dataframes.append( + super() + .shift(periods=period, freq=freq, axis=axis, fill_value=fill_value) + .add_suffix(f"{suffix}_{period}" if suffix else f"_{period}") + ) + return concat(shifted_dataframes, axis=1) + elif suffix: + raise ValueError("Cannot specify `suffix` if `periods` is an int.") + periods = cast(int, periods) + + ncols = len(self.columns) + arrays = self._mgr.arrays + if axis == 1 and periods != 0 and ncols > 0 and freq is None: + if fill_value is lib.no_default: + # We will infer fill_value to match the closest column + + # Use a column that we know is valid for our column's dtype GH#38434 + label = self.columns[0] + + if periods > 0: + result = self.iloc[:, :-periods] + for col in range(min(ncols, abs(periods))): + # TODO(EA2D): doing this in a loop unnecessary with 2D EAs + # Define filler inside loop so we get a copy + filler = self.iloc[:, 0].shift(len(self)) + result.insert(0, label, filler, allow_duplicates=True) + else: + result = self.iloc[:, -periods:] + for col in range(min(ncols, abs(periods))): + # Define filler inside loop so we get a copy + filler = self.iloc[:, -1].shift(len(self)) + result.insert( + len(result.columns), label, filler, allow_duplicates=True + ) + + result.columns = self.columns.copy() + return result + elif len(arrays) > 1 or ( + # If we only have one block and we know that we can't + # keep the same dtype (i.e. the _can_hold_element check) + # then we can go through the reindex_indexer path + # (and avoid casting logic in the Block method). + not can_hold_element(arrays[0], fill_value) + ): + # GH#35488 we need to watch out for multi-block cases + # We only get here with fill_value not-lib.no_default + nper = abs(periods) + nper = min(nper, ncols) + if periods > 0: + indexer = np.array( + [-1] * nper + list(range(ncols - periods)), dtype=np.intp + ) + else: + indexer = np.array( + list(range(nper, ncols)) + [-1] * nper, dtype=np.intp + ) + mgr = self._mgr.reindex_indexer( + self.columns, + indexer, + axis=0, + fill_value=fill_value, + allow_dups=True, + ) + res_df = self._constructor_from_mgr(mgr, axes=mgr.axes) + return res_df.__finalize__(self, method="shift") + else: + return self.T.shift(periods=periods, fill_value=fill_value).T + + return super().shift( + periods=periods, freq=freq, axis=axis, fill_value=fill_value + ) + + @overload + def set_index( + self, + keys, + *, + drop: bool = ..., + append: bool = ..., + inplace: Literal[False] = ..., + verify_integrity: bool = ..., + ) -> DataFrame: + ... + + @overload + def set_index( + self, + keys, + *, + drop: bool = ..., + append: bool = ..., + inplace: Literal[True], + verify_integrity: bool = ..., + ) -> None: + ... + + def set_index( + self, + keys, + *, + drop: bool = True, + append: bool = False, + inplace: bool = False, + verify_integrity: bool = False, + ) -> DataFrame | None: + """ + Set the DataFrame index using existing columns. + + Set the DataFrame index (row labels) using one or more existing + columns or arrays (of the correct length). The index can replace the + existing index or expand on it. + + Parameters + ---------- + keys : label or array-like or list of labels/arrays + This parameter can be either a single column key, a single array of + the same length as the calling DataFrame, or a list containing an + arbitrary combination of column keys and arrays. Here, "array" + encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and + instances of :class:`~collections.abc.Iterator`. + drop : bool, default True + Delete columns to be used as the new index. + append : bool, default False + Whether to append columns to existing index. + inplace : bool, default False + Whether to modify the DataFrame rather than creating a new one. + verify_integrity : bool, default False + Check the new index for duplicates. Otherwise defer the check until + necessary. Setting to False will improve the performance of this + method. + + Returns + ------- + DataFrame or None + Changed row labels or None if ``inplace=True``. + + See Also + -------- + DataFrame.reset_index : Opposite of set_index. + DataFrame.reindex : Change to new indices or expand indices. + DataFrame.reindex_like : Change to same indices as other DataFrame. + + Examples + -------- + >>> df = pd.DataFrame({'month': [1, 4, 7, 10], + ... 'year': [2012, 2014, 2013, 2014], + ... 'sale': [55, 40, 84, 31]}) + >>> df + month year sale + 0 1 2012 55 + 1 4 2014 40 + 2 7 2013 84 + 3 10 2014 31 + + Set the index to become the 'month' column: + + >>> df.set_index('month') + year sale + month + 1 2012 55 + 4 2014 40 + 7 2013 84 + 10 2014 31 + + Create a MultiIndex using columns 'year' and 'month': + + >>> df.set_index(['year', 'month']) + sale + year month + 2012 1 55 + 2014 4 40 + 2013 7 84 + 2014 10 31 + + Create a MultiIndex using an Index and a column: + + >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year']) + month sale + year + 1 2012 1 55 + 2 2014 4 40 + 3 2013 7 84 + 4 2014 10 31 + + Create a MultiIndex using two Series: + + >>> s = pd.Series([1, 2, 3, 4]) + >>> df.set_index([s, s**2]) + month year sale + 1 1 1 2012 55 + 2 4 4 2014 40 + 3 9 7 2013 84 + 4 16 10 2014 31 + """ + inplace = validate_bool_kwarg(inplace, "inplace") + self._check_inplace_and_allows_duplicate_labels(inplace) + if not isinstance(keys, list): + keys = [keys] + + err_msg = ( + 'The parameter "keys" may be a column key, one-dimensional ' + "array, or a list containing only valid column keys and " + "one-dimensional arrays." + ) + + missing: list[Hashable] = [] + for col in keys: + if isinstance(col, (Index, Series, np.ndarray, list, abc.Iterator)): + # arrays are fine as long as they are one-dimensional + # iterators get converted to list below + if getattr(col, "ndim", 1) != 1: + raise ValueError(err_msg) + else: + # everything else gets tried as a key; see GH 24969 + try: + found = col in self.columns + except TypeError as err: + raise TypeError( + f"{err_msg}. Received column of type {type(col)}" + ) from err + else: + if not found: + missing.append(col) + + if missing: + raise KeyError(f"None of {missing} are in the columns") + + if inplace: + frame = self + else: + # GH 49473 Use "lazy copy" with Copy-on-Write + frame = self.copy(deep=None) + + arrays: list[Index] = [] + names: list[Hashable] = [] + if append: + names = list(self.index.names) + if isinstance(self.index, MultiIndex): + arrays.extend( + self.index._get_level_values(i) for i in range(self.index.nlevels) + ) + else: + arrays.append(self.index) + + to_remove: list[Hashable] = [] + for col in keys: + if isinstance(col, MultiIndex): + arrays.extend(col._get_level_values(n) for n in range(col.nlevels)) + names.extend(col.names) + elif isinstance(col, (Index, Series)): + # if Index then not MultiIndex (treated above) + + # error: Argument 1 to "append" of "list" has incompatible type + # "Union[Index, Series]"; expected "Index" + arrays.append(col) # type: ignore[arg-type] + names.append(col.name) + elif isinstance(col, (list, np.ndarray)): + # error: Argument 1 to "append" of "list" has incompatible type + # "Union[List[Any], ndarray]"; expected "Index" + arrays.append(col) # type: ignore[arg-type] + names.append(None) + elif isinstance(col, abc.Iterator): + # error: Argument 1 to "append" of "list" has incompatible type + # "List[Any]"; expected "Index" + arrays.append(list(col)) # type: ignore[arg-type] + names.append(None) + # from here, col can only be a column label + else: + arrays.append(frame[col]) + names.append(col) + if drop: + to_remove.append(col) + + if len(arrays[-1]) != len(self): + # check newest element against length of calling frame, since + # ensure_index_from_sequences would not raise for append=False. + raise ValueError( + f"Length mismatch: Expected {len(self)} rows, " + f"received array of length {len(arrays[-1])}" + ) + + index = ensure_index_from_sequences(arrays, names) + + if verify_integrity and not index.is_unique: + duplicates = index[index.duplicated()].unique() + raise ValueError(f"Index has duplicate keys: {duplicates}") + + # use set to handle duplicate column names gracefully in case of drop + for c in set(to_remove): + del frame[c] + + # clear up memory usage + index._cleanup() + + frame.index = index + + if not inplace: + return frame + return None + + @overload + def reset_index( + self, + level: IndexLabel = ..., + *, + drop: bool = ..., + inplace: Literal[False] = ..., + col_level: Hashable = ..., + col_fill: Hashable = ..., + allow_duplicates: bool | lib.NoDefault = ..., + names: Hashable | Sequence[Hashable] | None = None, + ) -> DataFrame: + ... + + @overload + def reset_index( + self, + level: IndexLabel = ..., + *, + drop: bool = ..., + inplace: Literal[True], + col_level: Hashable = ..., + col_fill: Hashable = ..., + allow_duplicates: bool | lib.NoDefault = ..., + names: Hashable | Sequence[Hashable] | None = None, + ) -> None: + ... + + @overload + def reset_index( + self, + level: IndexLabel = ..., + *, + drop: bool = ..., + inplace: bool = ..., + col_level: Hashable = ..., + col_fill: Hashable = ..., + allow_duplicates: bool | lib.NoDefault = ..., + names: Hashable | Sequence[Hashable] | None = None, + ) -> DataFrame | None: + ... + + def reset_index( + self, + level: IndexLabel | None = None, + *, + drop: bool = False, + inplace: bool = False, + col_level: Hashable = 0, + col_fill: Hashable = "", + allow_duplicates: bool | lib.NoDefault = lib.no_default, + names: Hashable | Sequence[Hashable] | None = None, + ) -> DataFrame | None: + """ + Reset the index, or a level of it. + + Reset the index of the DataFrame, and use the default one instead. + If the DataFrame has a MultiIndex, this method can remove one or more + levels. + + Parameters + ---------- + level : int, str, tuple, or list, default None + Only remove the given levels from the index. Removes all levels by + default. + drop : bool, default False + Do not try to insert index into dataframe columns. This resets + the index to the default integer index. + inplace : bool, default False + Whether to modify the DataFrame rather than creating a new one. + col_level : int or str, default 0 + If the columns have multiple levels, determines which level the + labels are inserted into. By default it is inserted into the first + level. + col_fill : object, default '' + If the columns have multiple levels, determines how the other + levels are named. If None then the index name is repeated. + allow_duplicates : bool, optional, default lib.no_default + Allow duplicate column labels to be created. + + .. versionadded:: 1.5.0 + + names : int, str or 1-dimensional list, default None + Using the given string, rename the DataFrame column which contains the + index data. If the DataFrame has a MultiIndex, this has to be a list or + tuple with length equal to the number of levels. + + .. versionadded:: 1.5.0 + + Returns + ------- + DataFrame or None + DataFrame with the new index or None if ``inplace=True``. + + See Also + -------- + DataFrame.set_index : Opposite of reset_index. + DataFrame.reindex : Change to new indices or expand indices. + DataFrame.reindex_like : Change to same indices as other DataFrame. + + Examples + -------- + >>> df = pd.DataFrame([('bird', 389.0), + ... ('bird', 24.0), + ... ('mammal', 80.5), + ... ('mammal', np.nan)], + ... index=['falcon', 'parrot', 'lion', 'monkey'], + ... columns=('class', 'max_speed')) + >>> df + class max_speed + falcon bird 389.0 + parrot bird 24.0 + lion mammal 80.5 + monkey mammal NaN + + When we reset the index, the old index is added as a column, and a + new sequential index is used: + + >>> df.reset_index() + index class max_speed + 0 falcon bird 389.0 + 1 parrot bird 24.0 + 2 lion mammal 80.5 + 3 monkey mammal NaN + + We can use the `drop` parameter to avoid the old index being added as + a column: + + >>> df.reset_index(drop=True) + class max_speed + 0 bird 389.0 + 1 bird 24.0 + 2 mammal 80.5 + 3 mammal NaN + + You can also use `reset_index` with `MultiIndex`. + + >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'), + ... ('bird', 'parrot'), + ... ('mammal', 'lion'), + ... ('mammal', 'monkey')], + ... names=['class', 'name']) + >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'), + ... ('species', 'type')]) + >>> df = pd.DataFrame([(389.0, 'fly'), + ... (24.0, 'fly'), + ... (80.5, 'run'), + ... (np.nan, 'jump')], + ... index=index, + ... columns=columns) + >>> df + speed species + max type + class name + bird falcon 389.0 fly + parrot 24.0 fly + mammal lion 80.5 run + monkey NaN jump + + Using the `names` parameter, choose a name for the index column: + + >>> df.reset_index(names=['classes', 'names']) + classes names speed species + max type + 0 bird falcon 389.0 fly + 1 bird parrot 24.0 fly + 2 mammal lion 80.5 run + 3 mammal monkey NaN jump + + If the index has multiple levels, we can reset a subset of them: + + >>> df.reset_index(level='class') + class speed species + max type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal NaN jump + + If we are not dropping the index, by default, it is placed in the top + level. We can place it in another level: + + >>> df.reset_index(level='class', col_level=1) + speed species + class max type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal NaN jump + + When the index is inserted under another level, we can specify under + which one with the parameter `col_fill`: + + >>> df.reset_index(level='class', col_level=1, col_fill='species') + species speed species + class max type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal NaN jump + + If we specify a nonexistent level for `col_fill`, it is created: + + >>> df.reset_index(level='class', col_level=1, col_fill='genus') + genus speed species + class max type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal NaN jump + """ + inplace = validate_bool_kwarg(inplace, "inplace") + self._check_inplace_and_allows_duplicate_labels(inplace) + if inplace: + new_obj = self + else: + new_obj = self.copy(deep=None) + if allow_duplicates is not lib.no_default: + allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates") + + new_index = default_index(len(new_obj)) + if level is not None: + if not isinstance(level, (tuple, list)): + level = [level] + level = [self.index._get_level_number(lev) for lev in level] + if len(level) < self.index.nlevels: + new_index = self.index.droplevel(level) + + if not drop: + to_insert: Iterable[tuple[Any, Any | None]] + + default = "index" if "index" not in self else "level_0" + names = self.index._get_default_index_names(names, default) + + if isinstance(self.index, MultiIndex): + to_insert = zip(self.index.levels, self.index.codes) + else: + to_insert = ((self.index, None),) + + multi_col = isinstance(self.columns, MultiIndex) + for i, (lev, lab) in reversed(list(enumerate(to_insert))): + if level is not None and i not in level: + continue + name = names[i] + if multi_col: + col_name = list(name) if isinstance(name, tuple) else [name] + if col_fill is None: + if len(col_name) not in (1, self.columns.nlevels): + raise ValueError( + "col_fill=None is incompatible " + f"with incomplete column name {name}" + ) + col_fill = col_name[0] + + lev_num = self.columns._get_level_number(col_level) + name_lst = [col_fill] * lev_num + col_name + missing = self.columns.nlevels - len(name_lst) + name_lst += [col_fill] * missing + name = tuple(name_lst) + + # to ndarray and maybe infer different dtype + level_values = lev._values + if level_values.dtype == np.object_: + level_values = lib.maybe_convert_objects(level_values) + + if lab is not None: + # if we have the codes, extract the values with a mask + level_values = algorithms.take( + level_values, lab, allow_fill=True, fill_value=lev._na_value + ) + + new_obj.insert( + 0, + name, + level_values, + allow_duplicates=allow_duplicates, + ) + + new_obj.index = new_index + if not inplace: + return new_obj + + return None + + # ---------------------------------------------------------------------- + # Reindex-based selection methods + + @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) + def isna(self) -> DataFrame: + res_mgr = self._mgr.isna(func=isna) + result = self._constructor_from_mgr(res_mgr, axes=res_mgr.axes) + return result.__finalize__(self, method="isna") + + @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) + def isnull(self) -> DataFrame: + """ + DataFrame.isnull is an alias for DataFrame.isna. + """ + return self.isna() + + @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) + def notna(self) -> DataFrame: + return ~self.isna() + + @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) + def notnull(self) -> DataFrame: + """ + DataFrame.notnull is an alias for DataFrame.notna. + """ + return ~self.isna() + + @overload + def dropna( + self, + *, + axis: Axis = ..., + how: AnyAll | lib.NoDefault = ..., + thresh: int | lib.NoDefault = ..., + subset: IndexLabel = ..., + inplace: Literal[False] = ..., + ignore_index: bool = ..., + ) -> DataFrame: + ... + + @overload + def dropna( + self, + *, + axis: Axis = ..., + how: AnyAll | lib.NoDefault = ..., + thresh: int | lib.NoDefault = ..., + subset: IndexLabel = ..., + inplace: Literal[True], + ignore_index: bool = ..., + ) -> None: + ... + + def dropna( + self, + *, + axis: Axis = 0, + how: AnyAll | lib.NoDefault = lib.no_default, + thresh: int | lib.NoDefault = lib.no_default, + subset: IndexLabel | None = None, + inplace: bool = False, + ignore_index: bool = False, + ) -> DataFrame | None: + """ + Remove missing values. + + See the :ref:`User Guide ` for more on which values are + considered missing, and how to work with missing data. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + Determine if rows or columns which contain missing values are + removed. + + * 0, or 'index' : Drop rows which contain missing values. + * 1, or 'columns' : Drop columns which contain missing value. + + Only a single axis is allowed. + + how : {'any', 'all'}, default 'any' + Determine if row or column is removed from DataFrame, when we have + at least one NA or all NA. + + * 'any' : If any NA values are present, drop that row or column. + * 'all' : If all values are NA, drop that row or column. + + thresh : int, optional + Require that many non-NA values. Cannot be combined with how. + subset : column label or sequence of labels, optional + Labels along other axis to consider, e.g. if you are dropping rows + these would be a list of columns to include. + inplace : bool, default False + Whether to modify the DataFrame rather than creating a new one. + ignore_index : bool, default ``False`` + If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 2.0.0 + + Returns + ------- + DataFrame or None + DataFrame with NA entries dropped from it or None if ``inplace=True``. + + See Also + -------- + DataFrame.isna: Indicate missing values. + DataFrame.notna : Indicate existing (non-missing) values. + DataFrame.fillna : Replace missing values. + Series.dropna : Drop missing values. + Index.dropna : Drop missing indices. + + Examples + -------- + >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], + ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], + ... "born": [pd.NaT, pd.Timestamp("1940-04-25"), + ... pd.NaT]}) + >>> df + name toy born + 0 Alfred NaN NaT + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT + + Drop the rows where at least one element is missing. + + >>> df.dropna() + name toy born + 1 Batman Batmobile 1940-04-25 + + Drop the columns where at least one element is missing. + + >>> df.dropna(axis='columns') + name + 0 Alfred + 1 Batman + 2 Catwoman + + Drop the rows where all elements are missing. + + >>> df.dropna(how='all') + name toy born + 0 Alfred NaN NaT + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT + + Keep only the rows with at least 2 non-NA values. + + >>> df.dropna(thresh=2) + name toy born + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT + + Define in which columns to look for missing values. + + >>> df.dropna(subset=['name', 'toy']) + name toy born + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT + """ + if (how is not lib.no_default) and (thresh is not lib.no_default): + raise TypeError( + "You cannot set both the how and thresh arguments at the same time." + ) + + if how is lib.no_default: + how = "any" + + inplace = validate_bool_kwarg(inplace, "inplace") + if isinstance(axis, (tuple, list)): + # GH20987 + raise TypeError("supplying multiple axes to axis is no longer supported.") + + axis = self._get_axis_number(axis) + agg_axis = 1 - axis + + agg_obj = self + if subset is not None: + # subset needs to be list + if not is_list_like(subset): + subset = [subset] + ax = self._get_axis(agg_axis) + indices = ax.get_indexer_for(subset) + check = indices == -1 + if check.any(): + raise KeyError(np.array(subset)[check].tolist()) + agg_obj = self.take(indices, axis=agg_axis) + + if thresh is not lib.no_default: + count = agg_obj.count(axis=agg_axis) + mask = count >= thresh + elif how == "any": + # faster equivalent to 'agg_obj.count(agg_axis) == self.shape[agg_axis]' + mask = notna(agg_obj).all(axis=agg_axis, bool_only=False) + elif how == "all": + # faster equivalent to 'agg_obj.count(agg_axis) > 0' + mask = notna(agg_obj).any(axis=agg_axis, bool_only=False) + else: + raise ValueError(f"invalid how option: {how}") + + if np.all(mask): + result = self.copy(deep=None) + else: + result = self.loc(axis=axis)[mask] + + if ignore_index: + result.index = default_index(len(result)) + + if not inplace: + return result + self._update_inplace(result) + return None + + @overload + def drop_duplicates( + self, + subset: Hashable | Sequence[Hashable] | None = ..., + *, + keep: DropKeep = ..., + inplace: Literal[True], + ignore_index: bool = ..., + ) -> None: + ... + + @overload + def drop_duplicates( + self, + subset: Hashable | Sequence[Hashable] | None = ..., + *, + keep: DropKeep = ..., + inplace: Literal[False] = ..., + ignore_index: bool = ..., + ) -> DataFrame: + ... + + @overload + def drop_duplicates( + self, + subset: Hashable | Sequence[Hashable] | None = ..., + *, + keep: DropKeep = ..., + inplace: bool = ..., + ignore_index: bool = ..., + ) -> DataFrame | None: + ... + + def drop_duplicates( + self, + subset: Hashable | Sequence[Hashable] | None = None, + *, + keep: DropKeep = "first", + inplace: bool = False, + ignore_index: bool = False, + ) -> DataFrame | None: + """ + Return DataFrame with duplicate rows removed. + + Considering certain columns is optional. Indexes, including time indexes + are ignored. + + Parameters + ---------- + subset : column label or sequence of labels, optional + Only consider certain columns for identifying duplicates, by + default use all of the columns. + keep : {'first', 'last', ``False``}, default 'first' + Determines which duplicates (if any) to keep. + + - 'first' : Drop duplicates except for the first occurrence. + - 'last' : Drop duplicates except for the last occurrence. + - ``False`` : Drop all duplicates. + + inplace : bool, default ``False`` + Whether to modify the DataFrame rather than creating a new one. + ignore_index : bool, default ``False`` + If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. + + Returns + ------- + DataFrame or None + DataFrame with duplicates removed or None if ``inplace=True``. + + See Also + -------- + DataFrame.value_counts: Count unique combinations of columns. + + Examples + -------- + Consider dataset containing ramen rating. + + >>> df = pd.DataFrame({ + ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], + ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], + ... 'rating': [4, 4, 3.5, 15, 5] + ... }) + >>> df + brand style rating + 0 Yum Yum cup 4.0 + 1 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 3 Indomie pack 15.0 + 4 Indomie pack 5.0 + + By default, it removes duplicate rows based on all columns. + + >>> df.drop_duplicates() + brand style rating + 0 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 3 Indomie pack 15.0 + 4 Indomie pack 5.0 + + To remove duplicates on specific column(s), use ``subset``. + + >>> df.drop_duplicates(subset=['brand']) + brand style rating + 0 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + + To remove duplicates and keep last occurrences, use ``keep``. + + >>> df.drop_duplicates(subset=['brand', 'style'], keep='last') + brand style rating + 1 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 4 Indomie pack 5.0 + """ + if self.empty: + return self.copy(deep=None) + + inplace = validate_bool_kwarg(inplace, "inplace") + ignore_index = validate_bool_kwarg(ignore_index, "ignore_index") + + result = self[-self.duplicated(subset, keep=keep)] + if ignore_index: + result.index = default_index(len(result)) + + if inplace: + self._update_inplace(result) + return None + else: + return result + + def duplicated( + self, + subset: Hashable | Sequence[Hashable] | None = None, + keep: DropKeep = "first", + ) -> Series: + """ + Return boolean Series denoting duplicate rows. + + Considering certain columns is optional. + + Parameters + ---------- + subset : column label or sequence of labels, optional + Only consider certain columns for identifying duplicates, by + default use all of the columns. + keep : {'first', 'last', False}, default 'first' + Determines which duplicates (if any) to mark. + + - ``first`` : Mark duplicates as ``True`` except for the first occurrence. + - ``last`` : Mark duplicates as ``True`` except for the last occurrence. + - False : Mark all duplicates as ``True``. + + Returns + ------- + Series + Boolean series for each duplicated rows. + + See Also + -------- + Index.duplicated : Equivalent method on index. + Series.duplicated : Equivalent method on Series. + Series.drop_duplicates : Remove duplicate values from Series. + DataFrame.drop_duplicates : Remove duplicate values from DataFrame. + + Examples + -------- + Consider dataset containing ramen rating. + + >>> df = pd.DataFrame({ + ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], + ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], + ... 'rating': [4, 4, 3.5, 15, 5] + ... }) + >>> df + brand style rating + 0 Yum Yum cup 4.0 + 1 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 3 Indomie pack 15.0 + 4 Indomie pack 5.0 + + By default, for each set of duplicated values, the first occurrence + is set on False and all others on True. + + >>> df.duplicated() + 0 False + 1 True + 2 False + 3 False + 4 False + dtype: bool + + By using 'last', the last occurrence of each set of duplicated values + is set on False and all others on True. + + >>> df.duplicated(keep='last') + 0 True + 1 False + 2 False + 3 False + 4 False + dtype: bool + + By setting ``keep`` on False, all duplicates are True. + + >>> df.duplicated(keep=False) + 0 True + 1 True + 2 False + 3 False + 4 False + dtype: bool + + To find duplicates on specific column(s), use ``subset``. + + >>> df.duplicated(subset=['brand']) + 0 False + 1 True + 2 False + 3 True + 4 True + dtype: bool + """ + + if self.empty: + return self._constructor_sliced(dtype=bool) + + def f(vals) -> tuple[np.ndarray, int]: + labels, shape = algorithms.factorize(vals, size_hint=len(self)) + return labels.astype("i8", copy=False), len(shape) + + if subset is None: + # https://github.com/pandas-dev/pandas/issues/28770 + # Incompatible types in assignment (expression has type "Index", variable + # has type "Sequence[Any]") + subset = self.columns # type: ignore[assignment] + elif ( + not np.iterable(subset) + or isinstance(subset, str) + or isinstance(subset, tuple) + and subset in self.columns + ): + subset = (subset,) + + # needed for mypy since can't narrow types using np.iterable + subset = cast(Sequence, subset) + + # Verify all columns in subset exist in the queried dataframe + # Otherwise, raise a KeyError, same as if you try to __getitem__ with a + # key that doesn't exist. + diff = set(subset) - set(self.columns) + if diff: + raise KeyError(Index(diff)) + + if len(subset) == 1 and self.columns.is_unique: + # GH#45236 This is faster than get_group_index below + result = self[subset[0]].duplicated(keep) + result.name = None + else: + vals = (col.values for name, col in self.items() if name in subset) + labels, shape = map(list, zip(*map(f, vals))) + + ids = get_group_index(labels, tuple(shape), sort=False, xnull=False) + result = self._constructor_sliced(duplicated(ids, keep), index=self.index) + return result.__finalize__(self, method="duplicated") + + # ---------------------------------------------------------------------- + # Sorting + # error: Signature of "sort_values" incompatible with supertype "NDFrame" + @overload # type: ignore[override] + def sort_values( + self, + by: IndexLabel, + *, + axis: Axis = ..., + ascending=..., + inplace: Literal[False] = ..., + kind: SortKind = ..., + na_position: NaPosition = ..., + ignore_index: bool = ..., + key: ValueKeyFunc = ..., + ) -> DataFrame: + ... + + @overload + def sort_values( + self, + by: IndexLabel, + *, + axis: Axis = ..., + ascending=..., + inplace: Literal[True], + kind: SortKind = ..., + na_position: str = ..., + ignore_index: bool = ..., + key: ValueKeyFunc = ..., + ) -> None: + ... + + def sort_values( + self, + by: IndexLabel, + *, + axis: Axis = 0, + ascending: bool | list[bool] | tuple[bool, ...] = True, + inplace: bool = False, + kind: SortKind = "quicksort", + na_position: str = "last", + ignore_index: bool = False, + key: ValueKeyFunc | None = None, + ) -> DataFrame | None: + """ + Sort by the values along either axis. + + Parameters + ---------- + by : str or list of str + Name or list of names to sort by. + + - if `axis` is 0 or `'index'` then `by` may contain index + levels and/or column labels. + - if `axis` is 1 or `'columns'` then `by` may contain column + levels and/or index labels. + axis : "{0 or 'index', 1 or 'columns'}", default 0 + Axis to be sorted. + ascending : bool or list of bool, default True + Sort ascending vs. descending. Specify list for multiple sort + orders. If this is a list of bools, must match the length of + the by. + inplace : bool, default False + If True, perform operation in-place. + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort' + Choice of sorting algorithm. See also :func:`numpy.sort` for more + information. `mergesort` and `stable` are the only stable algorithms. For + DataFrames, this option is only applied when sorting on a single + column or label. + na_position : {'first', 'last'}, default 'last' + Puts NaNs at the beginning if `first`; `last` puts NaNs at the + end. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + key : callable, optional + Apply the key function to the values + before sorting. This is similar to the `key` argument in the + builtin :meth:`sorted` function, with the notable difference that + this `key` function should be *vectorized*. It should expect a + ``Series`` and return a Series with the same shape as the input. + It will be applied to each column in `by` independently. + + Returns + ------- + DataFrame or None + DataFrame with sorted values or None if ``inplace=True``. + + See Also + -------- + DataFrame.sort_index : Sort a DataFrame by the index. + Series.sort_values : Similar method for a Series. + + Examples + -------- + >>> df = pd.DataFrame({ + ... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'], + ... 'col2': [2, 1, 9, 8, 7, 4], + ... 'col3': [0, 1, 9, 4, 2, 3], + ... 'col4': ['a', 'B', 'c', 'D', 'e', 'F'] + ... }) + >>> df + col1 col2 col3 col4 + 0 A 2 0 a + 1 A 1 1 B + 2 B 9 9 c + 3 NaN 8 4 D + 4 D 7 2 e + 5 C 4 3 F + + Sort by col1 + + >>> df.sort_values(by=['col1']) + col1 col2 col3 col4 + 0 A 2 0 a + 1 A 1 1 B + 2 B 9 9 c + 5 C 4 3 F + 4 D 7 2 e + 3 NaN 8 4 D + + Sort by multiple columns + + >>> df.sort_values(by=['col1', 'col2']) + col1 col2 col3 col4 + 1 A 1 1 B + 0 A 2 0 a + 2 B 9 9 c + 5 C 4 3 F + 4 D 7 2 e + 3 NaN 8 4 D + + Sort Descending + + >>> df.sort_values(by='col1', ascending=False) + col1 col2 col3 col4 + 4 D 7 2 e + 5 C 4 3 F + 2 B 9 9 c + 0 A 2 0 a + 1 A 1 1 B + 3 NaN 8 4 D + + Putting NAs first + + >>> df.sort_values(by='col1', ascending=False, na_position='first') + col1 col2 col3 col4 + 3 NaN 8 4 D + 4 D 7 2 e + 5 C 4 3 F + 2 B 9 9 c + 0 A 2 0 a + 1 A 1 1 B + + Sorting with a key function + + >>> df.sort_values(by='col4', key=lambda col: col.str.lower()) + col1 col2 col3 col4 + 0 A 2 0 a + 1 A 1 1 B + 2 B 9 9 c + 3 NaN 8 4 D + 4 D 7 2 e + 5 C 4 3 F + + Natural sort with the key argument, + using the `natsort ` package. + + >>> df = pd.DataFrame({ + ... "time": ['0hr', '128hr', '72hr', '48hr', '96hr'], + ... "value": [10, 20, 30, 40, 50] + ... }) + >>> df + time value + 0 0hr 10 + 1 128hr 20 + 2 72hr 30 + 3 48hr 40 + 4 96hr 50 + >>> from natsort import index_natsorted + >>> df.sort_values( + ... by="time", + ... key=lambda x: np.argsort(index_natsorted(df["time"])) + ... ) + time value + 0 0hr 10 + 3 48hr 40 + 2 72hr 30 + 4 96hr 50 + 1 128hr 20 + """ + inplace = validate_bool_kwarg(inplace, "inplace") + axis = self._get_axis_number(axis) + ascending = validate_ascending(ascending) + if not isinstance(by, list): + by = [by] + # error: Argument 1 to "len" has incompatible type "Union[bool, List[bool]]"; + # expected "Sized" + if is_sequence(ascending) and ( + len(by) != len(ascending) # type: ignore[arg-type] + ): + # error: Argument 1 to "len" has incompatible type "Union[bool, + # List[bool]]"; expected "Sized" + raise ValueError( + f"Length of ascending ({len(ascending)})" # type: ignore[arg-type] + f" != length of by ({len(by)})" + ) + if len(by) > 1: + keys = [self._get_label_or_level_values(x, axis=axis) for x in by] + + # need to rewrap columns in Series to apply key function + if key is not None: + # error: List comprehension has incompatible type List[Series]; + # expected List[ndarray] + keys = [ + Series(k, name=name) # type: ignore[misc] + for (k, name) in zip(keys, by) + ] + + indexer = lexsort_indexer( + keys, orders=ascending, na_position=na_position, key=key + ) + elif len(by): + # len(by) == 1 + + k = self._get_label_or_level_values(by[0], axis=axis) + + # need to rewrap column in Series to apply key function + if key is not None: + # error: Incompatible types in assignment (expression has type + # "Series", variable has type "ndarray") + k = Series(k, name=by[0]) # type: ignore[assignment] + + if isinstance(ascending, (tuple, list)): + ascending = ascending[0] + + indexer = nargsort( + k, kind=kind, ascending=ascending, na_position=na_position, key=key + ) + else: + if inplace: + return self._update_inplace(self) + else: + return self.copy(deep=None) + + if is_range_indexer(indexer, len(indexer)): + result = self.copy(deep=(not inplace and not using_copy_on_write())) + if ignore_index: + result.index = default_index(len(result)) + + if inplace: + return self._update_inplace(result) + else: + return result + + new_data = self._mgr.take( + indexer, axis=self._get_block_manager_axis(axis), verify=False + ) + + if ignore_index: + new_data.set_axis( + self._get_block_manager_axis(axis), default_index(len(indexer)) + ) + + result = self._constructor_from_mgr(new_data, axes=new_data.axes) + if inplace: + return self._update_inplace(result) + else: + return result.__finalize__(self, method="sort_values") + + @overload + def sort_index( + self, + *, + axis: Axis = ..., + level: IndexLabel = ..., + ascending: bool | Sequence[bool] = ..., + inplace: Literal[True], + kind: SortKind = ..., + na_position: NaPosition = ..., + sort_remaining: bool = ..., + ignore_index: bool = ..., + key: IndexKeyFunc = ..., + ) -> None: + ... + + @overload + def sort_index( + self, + *, + axis: Axis = ..., + level: IndexLabel = ..., + ascending: bool | Sequence[bool] = ..., + inplace: Literal[False] = ..., + kind: SortKind = ..., + na_position: NaPosition = ..., + sort_remaining: bool = ..., + ignore_index: bool = ..., + key: IndexKeyFunc = ..., + ) -> DataFrame: + ... + + @overload + def sort_index( + self, + *, + axis: Axis = ..., + level: IndexLabel = ..., + ascending: bool | Sequence[bool] = ..., + inplace: bool = ..., + kind: SortKind = ..., + na_position: NaPosition = ..., + sort_remaining: bool = ..., + ignore_index: bool = ..., + key: IndexKeyFunc = ..., + ) -> DataFrame | None: + ... + + def sort_index( + self, + *, + axis: Axis = 0, + level: IndexLabel | None = None, + ascending: bool | Sequence[bool] = True, + inplace: bool = False, + kind: SortKind = "quicksort", + na_position: NaPosition = "last", + sort_remaining: bool = True, + ignore_index: bool = False, + key: IndexKeyFunc | None = None, + ) -> DataFrame | None: + """ + Sort object by labels (along an axis). + + Returns a new DataFrame sorted by label if `inplace` argument is + ``False``, otherwise updates the original DataFrame and returns None. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis along which to sort. The value 0 identifies the rows, + and 1 identifies the columns. + level : int or level name or list of ints or list of level names + If not None, sort on values in specified index level(s). + ascending : bool or list-like of bools, default True + Sort ascending vs. descending. When the index is a MultiIndex the + sort direction can be controlled for each level individually. + inplace : bool, default False + Whether to modify the DataFrame rather than creating a new one. + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort' + Choice of sorting algorithm. See also :func:`numpy.sort` for more + information. `mergesort` and `stable` are the only stable algorithms. For + DataFrames, this option is only applied when sorting on a single + column or label. + na_position : {'first', 'last'}, default 'last' + Puts NaNs at the beginning if `first`; `last` puts NaNs at the end. + Not implemented for MultiIndex. + sort_remaining : bool, default True + If True and sorting by level and index is multilevel, sort by other + levels too (in order) after sorting by specified level. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + key : callable, optional + If not None, apply the key function to the index values + before sorting. This is similar to the `key` argument in the + builtin :meth:`sorted` function, with the notable difference that + this `key` function should be *vectorized*. It should expect an + ``Index`` and return an ``Index`` of the same shape. For MultiIndex + inputs, the key is applied *per level*. + + Returns + ------- + DataFrame or None + The original DataFrame sorted by the labels or None if ``inplace=True``. + + See Also + -------- + Series.sort_index : Sort Series by the index. + DataFrame.sort_values : Sort DataFrame by the value. + Series.sort_values : Sort Series by the value. + + Examples + -------- + >>> df = pd.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150], + ... columns=['A']) + >>> df.sort_index() + A + 1 4 + 29 2 + 100 1 + 150 5 + 234 3 + + By default, it sorts in ascending order, to sort in descending order, + use ``ascending=False`` + + >>> df.sort_index(ascending=False) + A + 234 3 + 150 5 + 100 1 + 29 2 + 1 4 + + A key function can be specified which is applied to the index before + sorting. For a ``MultiIndex`` this is applied to each level separately. + + >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd']) + >>> df.sort_index(key=lambda x: x.str.lower()) + a + A 1 + b 2 + C 3 + d 4 + """ + return super().sort_index( + axis=axis, + level=level, + ascending=ascending, + inplace=inplace, + kind=kind, + na_position=na_position, + sort_remaining=sort_remaining, + ignore_index=ignore_index, + key=key, + ) + + def value_counts( + self, + subset: IndexLabel | None = None, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ) -> Series: + """ + Return a Series containing the frequency of each distinct row in the Dataframe. + + Parameters + ---------- + subset : label or list of labels, optional + Columns to use when counting unique combinations. + normalize : bool, default False + Return proportions rather than frequencies. + sort : bool, default True + Sort by frequencies when True. Sort by DataFrame column values when False. + ascending : bool, default False + Sort in ascending order. + dropna : bool, default True + Don't include counts of rows that contain NA values. + + .. versionadded:: 1.3.0 + + Returns + ------- + Series + + See Also + -------- + Series.value_counts: Equivalent method on Series. + + Notes + ----- + The returned Series will have a MultiIndex with one level per input + column but an Index (non-multi) for a single label. By default, rows + that contain any NA values are omitted from the result. By default, + the resulting Series will be in descending order so that the first + element is the most frequently-occurring row. + + Examples + -------- + >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6], + ... 'num_wings': [2, 0, 0, 0]}, + ... index=['falcon', 'dog', 'cat', 'ant']) + >>> df + num_legs num_wings + falcon 2 2 + dog 4 0 + cat 4 0 + ant 6 0 + + >>> df.value_counts() + num_legs num_wings + 4 0 2 + 2 2 1 + 6 0 1 + Name: count, dtype: int64 + + >>> df.value_counts(sort=False) + num_legs num_wings + 2 2 1 + 4 0 2 + 6 0 1 + Name: count, dtype: int64 + + >>> df.value_counts(ascending=True) + num_legs num_wings + 2 2 1 + 6 0 1 + 4 0 2 + Name: count, dtype: int64 + + >>> df.value_counts(normalize=True) + num_legs num_wings + 4 0 0.50 + 2 2 0.25 + 6 0 0.25 + Name: proportion, dtype: float64 + + With `dropna` set to `False` we can also count rows with NA values. + + >>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'], + ... 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']}) + >>> df + first_name middle_name + 0 John Smith + 1 Anne + 2 John + 3 Beth Louise + + >>> df.value_counts() + first_name middle_name + Beth Louise 1 + John Smith 1 + Name: count, dtype: int64 + + >>> df.value_counts(dropna=False) + first_name middle_name + Anne NaN 1 + Beth Louise 1 + John Smith 1 + NaN 1 + Name: count, dtype: int64 + + >>> df.value_counts("first_name") + first_name + John 2 + Anne 1 + Beth 1 + Name: count, dtype: int64 + """ + if subset is None: + subset = self.columns.tolist() + + name = "proportion" if normalize else "count" + counts = self.groupby(subset, dropna=dropna, observed=False)._grouper.size() + counts.name = name + + if sort: + counts = counts.sort_values(ascending=ascending) + if normalize: + counts /= counts.sum() + + # Force MultiIndex for a list_like subset with a single column + if is_list_like(subset) and len(subset) == 1: # type: ignore[arg-type] + counts.index = MultiIndex.from_arrays( + [counts.index], names=[counts.index.name] + ) + + return counts + + def nlargest( + self, n: int, columns: IndexLabel, keep: NsmallestNlargestKeep = "first" + ) -> DataFrame: + """ + Return the first `n` rows ordered by `columns` in descending order. + + Return the first `n` rows with the largest values in `columns`, in + descending order. The columns that are not specified are returned as + well, but not used for ordering. + + This method is equivalent to + ``df.sort_values(columns, ascending=False).head(n)``, but more + performant. + + Parameters + ---------- + n : int + Number of rows to return. + columns : label or list of labels + Column label(s) to order by. + keep : {'first', 'last', 'all'}, default 'first' + Where there are duplicate values: + + - ``first`` : prioritize the first occurrence(s) + - ``last`` : prioritize the last occurrence(s) + - ``all`` : keep all the ties of the smallest item even if it means + selecting more than ``n`` items. + + Returns + ------- + DataFrame + The first `n` rows ordered by the given columns in descending + order. + + See Also + -------- + DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in + ascending order. + DataFrame.sort_values : Sort DataFrame by the values. + DataFrame.head : Return the first `n` rows without re-ordering. + + Notes + ----- + This function cannot be used with all column types. For example, when + specifying columns with `object` or `category` dtypes, ``TypeError`` is + raised. + + Examples + -------- + >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, + ... 434000, 434000, 337000, 11300, + ... 11300, 11300], + ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, + ... 17036, 182, 38, 311], + ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", + ... "IS", "NR", "TV", "AI"]}, + ... index=["Italy", "France", "Malta", + ... "Maldives", "Brunei", "Iceland", + ... "Nauru", "Tuvalu", "Anguilla"]) + >>> df + population GDP alpha-2 + Italy 59000000 1937894 IT + France 65000000 2583560 FR + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + Iceland 337000 17036 IS + Nauru 11300 182 NR + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + + In the following example, we will use ``nlargest`` to select the three + rows having the largest values in column "population". + + >>> df.nlargest(3, 'population') + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Malta 434000 12011 MT + + When using ``keep='last'``, ties are resolved in reverse order: + + >>> df.nlargest(3, 'population', keep='last') + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Brunei 434000 12128 BN + + When using ``keep='all'``, the number of element kept can go beyond ``n`` + if there are duplicate values for the smallest element, all the + ties are kept: + + >>> df.nlargest(3, 'population', keep='all') + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + + However, ``nlargest`` does not keep ``n`` distinct largest elements: + + >>> df.nlargest(5, 'population', keep='all') + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + + To order by the largest values in column "population" and then "GDP", + we can specify multiple columns like in the next example. + + >>> df.nlargest(3, ['population', 'GDP']) + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Brunei 434000 12128 BN + """ + return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest() + + def nsmallest( + self, n: int, columns: IndexLabel, keep: NsmallestNlargestKeep = "first" + ) -> DataFrame: + """ + Return the first `n` rows ordered by `columns` in ascending order. + + Return the first `n` rows with the smallest values in `columns`, in + ascending order. The columns that are not specified are returned as + well, but not used for ordering. + + This method is equivalent to + ``df.sort_values(columns, ascending=True).head(n)``, but more + performant. + + Parameters + ---------- + n : int + Number of items to retrieve. + columns : list or str + Column name or names to order by. + keep : {'first', 'last', 'all'}, default 'first' + Where there are duplicate values: + + - ``first`` : take the first occurrence. + - ``last`` : take the last occurrence. + - ``all`` : keep all the ties of the largest item even if it means + selecting more than ``n`` items. + + Returns + ------- + DataFrame + + See Also + -------- + DataFrame.nlargest : Return the first `n` rows ordered by `columns` in + descending order. + DataFrame.sort_values : Sort DataFrame by the values. + DataFrame.head : Return the first `n` rows without re-ordering. + + Examples + -------- + >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, + ... 434000, 434000, 337000, 337000, + ... 11300, 11300], + ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, + ... 17036, 182, 38, 311], + ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", + ... "IS", "NR", "TV", "AI"]}, + ... index=["Italy", "France", "Malta", + ... "Maldives", "Brunei", "Iceland", + ... "Nauru", "Tuvalu", "Anguilla"]) + >>> df + population GDP alpha-2 + Italy 59000000 1937894 IT + France 65000000 2583560 FR + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + Iceland 337000 17036 IS + Nauru 337000 182 NR + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + + In the following example, we will use ``nsmallest`` to select the + three rows having the smallest values in column "population". + + >>> df.nsmallest(3, 'population') + population GDP alpha-2 + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + Iceland 337000 17036 IS + + When using ``keep='last'``, ties are resolved in reverse order: + + >>> df.nsmallest(3, 'population', keep='last') + population GDP alpha-2 + Anguilla 11300 311 AI + Tuvalu 11300 38 TV + Nauru 337000 182 NR + + When using ``keep='all'``, the number of element kept can go beyond ``n`` + if there are duplicate values for the largest element, all the + ties are kept. + + >>> df.nsmallest(3, 'population', keep='all') + population GDP alpha-2 + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + Iceland 337000 17036 IS + Nauru 337000 182 NR + + However, ``nsmallest`` does not keep ``n`` distinct + smallest elements: + + >>> df.nsmallest(4, 'population', keep='all') + population GDP alpha-2 + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + Iceland 337000 17036 IS + Nauru 337000 182 NR + + To order by the smallest values in column "population" and then "GDP", we can + specify multiple columns like in the next example. + + >>> df.nsmallest(3, ['population', 'GDP']) + population GDP alpha-2 + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + Nauru 337000 182 NR + """ + return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nsmallest() + + @doc( + Series.swaplevel, + klass=_shared_doc_kwargs["klass"], + extra_params=dedent( + """axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to swap levels on. 0 or 'index' for row-wise, 1 or + 'columns' for column-wise.""" + ), + examples=dedent( + """\ + Examples + -------- + >>> df = pd.DataFrame( + ... {"Grade": ["A", "B", "A", "C"]}, + ... index=[ + ... ["Final exam", "Final exam", "Coursework", "Coursework"], + ... ["History", "Geography", "History", "Geography"], + ... ["January", "February", "March", "April"], + ... ], + ... ) + >>> df + Grade + Final exam History January A + Geography February B + Coursework History March A + Geography April C + + In the following example, we will swap the levels of the indices. + Here, we will swap the levels column-wise, but levels can be swapped row-wise + in a similar manner. Note that column-wise is the default behaviour. + By not supplying any arguments for i and j, we swap the last and second to + last indices. + + >>> df.swaplevel() + Grade + Final exam January History A + February Geography B + Coursework March History A + April Geography C + + By supplying one argument, we can choose which index to swap the last + index with. We can for example swap the first index with the last one as + follows. + + >>> df.swaplevel(0) + Grade + January History Final exam A + February Geography Final exam B + March History Coursework A + April Geography Coursework C + + We can also define explicitly which indices we want to swap by supplying values + for both i and j. Here, we for example swap the first and second indices. + + >>> df.swaplevel(0, 1) + Grade + History Final exam January A + Geography Final exam February B + History Coursework March A + Geography Coursework April C""" + ), + ) + def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: + result = self.copy(deep=None) + + axis = self._get_axis_number(axis) + + if not isinstance(result._get_axis(axis), MultiIndex): # pragma: no cover + raise TypeError("Can only swap levels on a hierarchical axis.") + + if axis == 0: + assert isinstance(result.index, MultiIndex) + result.index = result.index.swaplevel(i, j) + else: + assert isinstance(result.columns, MultiIndex) + result.columns = result.columns.swaplevel(i, j) + return result + + def reorder_levels(self, order: Sequence[int | str], axis: Axis = 0) -> DataFrame: + """ + Rearrange index levels using input order. May not drop or duplicate levels. + + Parameters + ---------- + order : list of int or list of str + List representing new level order. Reference level by number + (position) or by key (label). + axis : {0 or 'index', 1 or 'columns'}, default 0 + Where to reorder levels. + + Returns + ------- + DataFrame + + Examples + -------- + >>> data = { + ... "class": ["Mammals", "Mammals", "Reptiles"], + ... "diet": ["Omnivore", "Carnivore", "Carnivore"], + ... "species": ["Humans", "Dogs", "Snakes"], + ... } + >>> df = pd.DataFrame(data, columns=["class", "diet", "species"]) + >>> df = df.set_index(["class", "diet"]) + >>> df + species + class diet + Mammals Omnivore Humans + Carnivore Dogs + Reptiles Carnivore Snakes + + Let's reorder the levels of the index: + + >>> df.reorder_levels(["diet", "class"]) + species + diet class + Omnivore Mammals Humans + Carnivore Mammals Dogs + Reptiles Snakes + """ + axis = self._get_axis_number(axis) + if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover + raise TypeError("Can only reorder levels on a hierarchical axis.") + + result = self.copy(deep=None) + + if axis == 0: + assert isinstance(result.index, MultiIndex) + result.index = result.index.reorder_levels(order) + else: + assert isinstance(result.columns, MultiIndex) + result.columns = result.columns.reorder_levels(order) + return result + + # ---------------------------------------------------------------------- + # Arithmetic Methods + + def _cmp_method(self, other, op): + axis: Literal[1] = 1 # only relevant for Series other case + + self, other = self._align_for_op(other, axis, flex=False, level=None) + + # See GH#4537 for discussion of scalar op behavior + new_data = self._dispatch_frame_op(other, op, axis=axis) + return self._construct_result(new_data) + + def _arith_method(self, other, op): + if self._should_reindex_frame_op(other, op, 1, None, None): + return self._arith_method_with_reindex(other, op) + + axis: Literal[1] = 1 # only relevant for Series other case + other = ops.maybe_prepare_scalar_for_op(other, (self.shape[axis],)) + + self, other = self._align_for_op(other, axis, flex=True, level=None) + + with np.errstate(all="ignore"): + new_data = self._dispatch_frame_op(other, op, axis=axis) + return self._construct_result(new_data) + + _logical_method = _arith_method + + def _dispatch_frame_op( + self, right, func: Callable, axis: AxisInt | None = None + ) -> DataFrame: + """ + Evaluate the frame operation func(left, right) by evaluating + column-by-column, dispatching to the Series implementation. + + Parameters + ---------- + right : scalar, Series, or DataFrame + func : arithmetic or comparison operator + axis : {None, 0, 1} + + Returns + ------- + DataFrame + + Notes + ----- + Caller is responsible for setting np.errstate where relevant. + """ + # Get the appropriate array-op to apply to each column/block's values. + array_op = ops.get_array_op(func) + + right = lib.item_from_zerodim(right) + if not is_list_like(right): + # i.e. scalar, faster than checking np.ndim(right) == 0 + bm = self._mgr.apply(array_op, right=right) + return self._constructor_from_mgr(bm, axes=bm.axes) + + elif isinstance(right, DataFrame): + assert self.index.equals(right.index) + assert self.columns.equals(right.columns) + # TODO: The previous assertion `assert right._indexed_same(self)` + # fails in cases with empty columns reached via + # _frame_arith_method_with_reindex + + # TODO operate_blockwise expects a manager of the same type + bm = self._mgr.operate_blockwise( + # error: Argument 1 to "operate_blockwise" of "ArrayManager" has + # incompatible type "Union[ArrayManager, BlockManager]"; expected + # "ArrayManager" + # error: Argument 1 to "operate_blockwise" of "BlockManager" has + # incompatible type "Union[ArrayManager, BlockManager]"; expected + # "BlockManager" + right._mgr, # type: ignore[arg-type] + array_op, + ) + return self._constructor_from_mgr(bm, axes=bm.axes) + + elif isinstance(right, Series) and axis == 1: + # axis=1 means we want to operate row-by-row + assert right.index.equals(self.columns) + + right = right._values + # maybe_align_as_frame ensures we do not have an ndarray here + assert not isinstance(right, np.ndarray) + + arrays = [ + array_op(_left, _right) + for _left, _right in zip(self._iter_column_arrays(), right) + ] + + elif isinstance(right, Series): + assert right.index.equals(self.index) + right = right._values + + arrays = [array_op(left, right) for left in self._iter_column_arrays()] + + else: + raise NotImplementedError(right) + + return type(self)._from_arrays( + arrays, self.columns, self.index, verify_integrity=False + ) + + def _combine_frame(self, other: DataFrame, func, fill_value=None): + # at this point we have `self._indexed_same(other)` + + if fill_value is None: + # since _arith_op may be called in a loop, avoid function call + # overhead if possible by doing this check once + _arith_op = func + + else: + + def _arith_op(left, right): + # for the mixed_type case where we iterate over columns, + # _arith_op(left, right) is equivalent to + # left._binop(right, func, fill_value=fill_value) + left, right = ops.fill_binop(left, right, fill_value) + return func(left, right) + + new_data = self._dispatch_frame_op(other, _arith_op) + return new_data + + def _arith_method_with_reindex(self, right: DataFrame, op) -> DataFrame: + """ + For DataFrame-with-DataFrame operations that require reindexing, + operate only on shared columns, then reindex. + + Parameters + ---------- + right : DataFrame + op : binary operator + + Returns + ------- + DataFrame + """ + left = self + + # GH#31623, only operate on shared columns + cols, lcols, rcols = left.columns.join( + right.columns, how="inner", level=None, return_indexers=True + ) + + new_left = left.iloc[:, lcols] + new_right = right.iloc[:, rcols] + result = op(new_left, new_right) + + # Do the join on the columns instead of using left._align_for_op + # to avoid constructing two potentially large/sparse DataFrames + join_columns, _, _ = left.columns.join( + right.columns, how="outer", level=None, return_indexers=True + ) + + if result.columns.has_duplicates: + # Avoid reindexing with a duplicate axis. + # https://github.com/pandas-dev/pandas/issues/35194 + indexer, _ = result.columns.get_indexer_non_unique(join_columns) + indexer = algorithms.unique1d(indexer) + result = result._reindex_with_indexers( + {1: [join_columns, indexer]}, allow_dups=True + ) + else: + result = result.reindex(join_columns, axis=1) + + return result + + def _should_reindex_frame_op(self, right, op, axis: int, fill_value, level) -> bool: + """ + Check if this is an operation between DataFrames that will need to reindex. + """ + if op is operator.pow or op is roperator.rpow: + # GH#32685 pow has special semantics for operating with null values + return False + + if not isinstance(right, DataFrame): + return False + + if fill_value is None and level is None and axis == 1: + # TODO: any other cases we should handle here? + + # Intersection is always unique so we have to check the unique columns + left_uniques = self.columns.unique() + right_uniques = right.columns.unique() + cols = left_uniques.intersection(right_uniques) + if len(cols) and not ( + len(cols) == len(left_uniques) and len(cols) == len(right_uniques) + ): + # TODO: is there a shortcut available when len(cols) == 0? + return True + + return False + + def _align_for_op( + self, + other, + axis: AxisInt, + flex: bool | None = False, + level: Level | None = None, + ): + """ + Convert rhs to meet lhs dims if input is list, tuple or np.ndarray. + + Parameters + ---------- + left : DataFrame + right : Any + axis : int + flex : bool or None, default False + Whether this is a flex op, in which case we reindex. + None indicates not to check for alignment. + level : int or level name, default None + + Returns + ------- + left : DataFrame + right : Any + """ + left, right = self, other + + def to_series(right): + msg = ( + "Unable to coerce to Series, " + "length must be {req_len}: given {given_len}" + ) + + # pass dtype to avoid doing inference, which would break consistency + # with Index/Series ops + dtype = None + if getattr(right, "dtype", None) == object: + # can't pass right.dtype unconditionally as that would break on e.g. + # datetime64[h] ndarray + dtype = object + + if axis == 0: + if len(left.index) != len(right): + raise ValueError( + msg.format(req_len=len(left.index), given_len=len(right)) + ) + right = left._constructor_sliced(right, index=left.index, dtype=dtype) + else: + if len(left.columns) != len(right): + raise ValueError( + msg.format(req_len=len(left.columns), given_len=len(right)) + ) + right = left._constructor_sliced(right, index=left.columns, dtype=dtype) + return right + + if isinstance(right, np.ndarray): + if right.ndim == 1: + right = to_series(right) + + elif right.ndim == 2: + # We need to pass dtype=right.dtype to retain object dtype + # otherwise we lose consistency with Index and array ops + dtype = None + if right.dtype == object: + # can't pass right.dtype unconditionally as that would break on e.g. + # datetime64[h] ndarray + dtype = object + + if right.shape == left.shape: + right = left._constructor( + right, index=left.index, columns=left.columns, dtype=dtype + ) + + elif right.shape[0] == left.shape[0] and right.shape[1] == 1: + # Broadcast across columns + right = np.broadcast_to(right, left.shape) + right = left._constructor( + right, index=left.index, columns=left.columns, dtype=dtype + ) + + elif right.shape[1] == left.shape[1] and right.shape[0] == 1: + # Broadcast along rows + right = to_series(right[0, :]) + + else: + raise ValueError( + "Unable to coerce to DataFrame, shape " + f"must be {left.shape}: given {right.shape}" + ) + + elif right.ndim > 2: + raise ValueError( + "Unable to coerce to Series/DataFrame, " + f"dimension must be <= 2: {right.shape}" + ) + + elif is_list_like(right) and not isinstance(right, (Series, DataFrame)): + # GH#36702. Raise when attempting arithmetic with list of array-like. + if any(is_array_like(el) for el in right): + raise ValueError( + f"Unable to coerce list of {type(right[0])} to Series/DataFrame" + ) + # GH#17901 + right = to_series(right) + + if flex is not None and isinstance(right, DataFrame): + if not left._indexed_same(right): + if flex: + left, right = left.align( + right, join="outer", level=level, copy=False + ) + else: + raise ValueError( + "Can only compare identically-labeled (both index and columns) " + "DataFrame objects" + ) + elif isinstance(right, Series): + # axis=1 is default for DataFrame-with-Series op + axis = axis if axis is not None else 1 + if not flex: + if not left.axes[axis].equals(right.index): + raise ValueError( + "Operands are not aligned. Do " + "`left, right = left.align(right, axis=1, copy=False)` " + "before operating." + ) + + left, right = left.align( + right, + join="outer", + axis=axis, + level=level, + copy=False, + ) + right = left._maybe_align_series_as_frame(right, axis) + + return left, right + + def _maybe_align_series_as_frame(self, series: Series, axis: AxisInt): + """ + If the Series operand is not EA-dtype, we can broadcast to 2D and operate + blockwise. + """ + rvalues = series._values + if not isinstance(rvalues, np.ndarray): + # TODO(EA2D): no need to special-case with 2D EAs + if rvalues.dtype in ("datetime64[ns]", "timedelta64[ns]"): + # We can losslessly+cheaply cast to ndarray + rvalues = np.asarray(rvalues) + else: + return series + + if axis == 0: + rvalues = rvalues.reshape(-1, 1) + else: + rvalues = rvalues.reshape(1, -1) + + rvalues = np.broadcast_to(rvalues, self.shape) + # pass dtype to avoid doing inference + return self._constructor( + rvalues, + index=self.index, + columns=self.columns, + dtype=rvalues.dtype, + ) + + def _flex_arith_method( + self, other, op, *, axis: Axis = "columns", level=None, fill_value=None + ): + axis = self._get_axis_number(axis) if axis is not None else 1 + + if self._should_reindex_frame_op(other, op, axis, fill_value, level): + return self._arith_method_with_reindex(other, op) + + if isinstance(other, Series) and fill_value is not None: + # TODO: We could allow this in cases where we end up going + # through the DataFrame path + raise NotImplementedError(f"fill_value {fill_value} not supported.") + + other = ops.maybe_prepare_scalar_for_op(other, self.shape) + self, other = self._align_for_op(other, axis, flex=True, level=level) + + with np.errstate(all="ignore"): + if isinstance(other, DataFrame): + # Another DataFrame + new_data = self._combine_frame(other, op, fill_value) + + elif isinstance(other, Series): + new_data = self._dispatch_frame_op(other, op, axis=axis) + else: + # in this case we always have `np.ndim(other) == 0` + if fill_value is not None: + self = self.fillna(fill_value) + + new_data = self._dispatch_frame_op(other, op) + + return self._construct_result(new_data) + + def _construct_result(self, result) -> DataFrame: + """ + Wrap the result of an arithmetic, comparison, or logical operation. + + Parameters + ---------- + result : DataFrame + + Returns + ------- + DataFrame + """ + out = self._constructor(result, copy=False).__finalize__(self) + # Pin columns instead of passing to constructor for compat with + # non-unique columns case + out.columns = self.columns + out.index = self.index + return out + + def __divmod__(self, other) -> tuple[DataFrame, DataFrame]: + # Naive implementation, room for optimization + div = self // other + mod = self - div * other + return div, mod + + def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]: + # Naive implementation, room for optimization + div = other // self + mod = other - div * self + return div, mod + + def _flex_cmp_method(self, other, op, *, axis: Axis = "columns", level=None): + axis = self._get_axis_number(axis) if axis is not None else 1 + + self, other = self._align_for_op(other, axis, flex=True, level=level) + + new_data = self._dispatch_frame_op(other, op, axis=axis) + return self._construct_result(new_data) + + @Appender(ops.make_flex_doc("eq", "dataframe")) + def eq(self, other, axis: Axis = "columns", level=None) -> DataFrame: + return self._flex_cmp_method(other, operator.eq, axis=axis, level=level) + + @Appender(ops.make_flex_doc("ne", "dataframe")) + def ne(self, other, axis: Axis = "columns", level=None) -> DataFrame: + return self._flex_cmp_method(other, operator.ne, axis=axis, level=level) + + @Appender(ops.make_flex_doc("le", "dataframe")) + def le(self, other, axis: Axis = "columns", level=None) -> DataFrame: + return self._flex_cmp_method(other, operator.le, axis=axis, level=level) + + @Appender(ops.make_flex_doc("lt", "dataframe")) + def lt(self, other, axis: Axis = "columns", level=None) -> DataFrame: + return self._flex_cmp_method(other, operator.lt, axis=axis, level=level) + + @Appender(ops.make_flex_doc("ge", "dataframe")) + def ge(self, other, axis: Axis = "columns", level=None) -> DataFrame: + return self._flex_cmp_method(other, operator.ge, axis=axis, level=level) + + @Appender(ops.make_flex_doc("gt", "dataframe")) + def gt(self, other, axis: Axis = "columns", level=None) -> DataFrame: + return self._flex_cmp_method(other, operator.gt, axis=axis, level=level) + + @Appender(ops.make_flex_doc("add", "dataframe")) + def add( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: + return self._flex_arith_method( + other, operator.add, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("radd", "dataframe")) + def radd( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: + return self._flex_arith_method( + other, roperator.radd, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("sub", "dataframe")) + def sub( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: + return self._flex_arith_method( + other, operator.sub, level=level, fill_value=fill_value, axis=axis + ) + + subtract = sub + + @Appender(ops.make_flex_doc("rsub", "dataframe")) + def rsub( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: + return self._flex_arith_method( + other, roperator.rsub, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("mul", "dataframe")) + def mul( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: + return self._flex_arith_method( + other, operator.mul, level=level, fill_value=fill_value, axis=axis + ) + + multiply = mul + + @Appender(ops.make_flex_doc("rmul", "dataframe")) + def rmul( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: + return self._flex_arith_method( + other, roperator.rmul, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("truediv", "dataframe")) + def truediv( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: + return self._flex_arith_method( + other, operator.truediv, level=level, fill_value=fill_value, axis=axis + ) + + div = truediv + divide = truediv + + @Appender(ops.make_flex_doc("rtruediv", "dataframe")) + def rtruediv( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: + return self._flex_arith_method( + other, roperator.rtruediv, level=level, fill_value=fill_value, axis=axis + ) + + rdiv = rtruediv + + @Appender(ops.make_flex_doc("floordiv", "dataframe")) + def floordiv( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: + return self._flex_arith_method( + other, operator.floordiv, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("rfloordiv", "dataframe")) + def rfloordiv( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: + return self._flex_arith_method( + other, roperator.rfloordiv, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("mod", "dataframe")) + def mod( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: + return self._flex_arith_method( + other, operator.mod, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("rmod", "dataframe")) + def rmod( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: + return self._flex_arith_method( + other, roperator.rmod, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("pow", "dataframe")) + def pow( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: + return self._flex_arith_method( + other, operator.pow, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("rpow", "dataframe")) + def rpow( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: + return self._flex_arith_method( + other, roperator.rpow, level=level, fill_value=fill_value, axis=axis + ) + + # ---------------------------------------------------------------------- + # Combination-Related + + @doc( + _shared_docs["compare"], + dedent( + """ + Returns + ------- + DataFrame + DataFrame that shows the differences stacked side by side. + + The resulting index will be a MultiIndex with 'self' and 'other' + stacked alternately at the inner level. + + Raises + ------ + ValueError + When the two DataFrames don't have identical labels or shape. + + See Also + -------- + Series.compare : Compare with another Series and show differences. + DataFrame.equals : Test whether two objects contain the same elements. + + Notes + ----- + Matching NaNs will not appear as a difference. + + Can only compare identically-labeled + (i.e. same shape, identical row and column labels) DataFrames + + Examples + -------- + >>> df = pd.DataFrame( + ... {{ + ... "col1": ["a", "a", "b", "b", "a"], + ... "col2": [1.0, 2.0, 3.0, np.nan, 5.0], + ... "col3": [1.0, 2.0, 3.0, 4.0, 5.0] + ... }}, + ... columns=["col1", "col2", "col3"], + ... ) + >>> df + col1 col2 col3 + 0 a 1.0 1.0 + 1 a 2.0 2.0 + 2 b 3.0 3.0 + 3 b NaN 4.0 + 4 a 5.0 5.0 + + >>> df2 = df.copy() + >>> df2.loc[0, 'col1'] = 'c' + >>> df2.loc[2, 'col3'] = 4.0 + >>> df2 + col1 col2 col3 + 0 c 1.0 1.0 + 1 a 2.0 2.0 + 2 b 3.0 4.0 + 3 b NaN 4.0 + 4 a 5.0 5.0 + + Align the differences on columns + + >>> df.compare(df2) + col1 col3 + self other self other + 0 a c NaN NaN + 2 NaN NaN 3.0 4.0 + + Assign result_names + + >>> df.compare(df2, result_names=("left", "right")) + col1 col3 + left right left right + 0 a c NaN NaN + 2 NaN NaN 3.0 4.0 + + Stack the differences on rows + + >>> df.compare(df2, align_axis=0) + col1 col3 + 0 self a NaN + other c NaN + 2 self NaN 3.0 + other NaN 4.0 + + Keep the equal values + + >>> df.compare(df2, keep_equal=True) + col1 col3 + self other self other + 0 a c 1.0 1.0 + 2 b b 3.0 4.0 + + Keep all original rows and columns + + >>> df.compare(df2, keep_shape=True) + col1 col2 col3 + self other self other self other + 0 a c NaN NaN NaN NaN + 1 NaN NaN NaN NaN NaN NaN + 2 NaN NaN NaN NaN 3.0 4.0 + 3 NaN NaN NaN NaN NaN NaN + 4 NaN NaN NaN NaN NaN NaN + + Keep all original rows and columns and also all original values + + >>> df.compare(df2, keep_shape=True, keep_equal=True) + col1 col2 col3 + self other self other self other + 0 a c 1.0 1.0 1.0 1.0 + 1 a a 2.0 2.0 2.0 2.0 + 2 b b 3.0 3.0 3.0 4.0 + 3 b b NaN NaN 4.0 4.0 + 4 a a 5.0 5.0 5.0 5.0 + """ + ), + klass=_shared_doc_kwargs["klass"], + ) + def compare( + self, + other: DataFrame, + align_axis: Axis = 1, + keep_shape: bool = False, + keep_equal: bool = False, + result_names: Suffixes = ("self", "other"), + ) -> DataFrame: + return super().compare( + other=other, + align_axis=align_axis, + keep_shape=keep_shape, + keep_equal=keep_equal, + result_names=result_names, + ) + + def combine( + self, + other: DataFrame, + func: Callable[[Series, Series], Series | Hashable], + fill_value=None, + overwrite: bool = True, + ) -> DataFrame: + """ + Perform column-wise combine with another DataFrame. + + Combines a DataFrame with `other` DataFrame using `func` + to element-wise combine columns. The row and column indexes of the + resulting DataFrame will be the union of the two. + + Parameters + ---------- + other : DataFrame + The DataFrame to merge column-wise. + func : function + Function that takes two series as inputs and return a Series or a + scalar. Used to merge the two dataframes column by columns. + fill_value : scalar value, default None + The value to fill NaNs with prior to passing any column to the + merge func. + overwrite : bool, default True + If True, columns in `self` that do not exist in `other` will be + overwritten with NaNs. + + Returns + ------- + DataFrame + Combination of the provided DataFrames. + + See Also + -------- + DataFrame.combine_first : Combine two DataFrame objects and default to + non-null values in frame calling the method. + + Examples + -------- + Combine using a simple function that chooses the smaller column. + + >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]}) + >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2 + >>> df1.combine(df2, take_smaller) + A B + 0 0 3 + 1 0 3 + + Example using a true element-wise combine function. + + >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]}) + >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> df1.combine(df2, np.minimum) + A B + 0 1 2 + 1 0 3 + + Using `fill_value` fills Nones prior to passing the column to the + merge function. + + >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]}) + >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> df1.combine(df2, take_smaller, fill_value=-5) + A B + 0 0 -5.0 + 1 0 4.0 + + However, if the same element in both dataframes is None, that None + is preserved + + >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]}) + >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]}) + >>> df1.combine(df2, take_smaller, fill_value=-5) + A B + 0 0 -5.0 + 1 0 3.0 + + Example that demonstrates the use of `overwrite` and behavior when + the axis differ between the dataframes. + + >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]}) + >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2]) + >>> df1.combine(df2, take_smaller) + A B C + 0 NaN NaN NaN + 1 NaN 3.0 -10.0 + 2 NaN 3.0 1.0 + + >>> df1.combine(df2, take_smaller, overwrite=False) + A B C + 0 0.0 NaN NaN + 1 0.0 3.0 -10.0 + 2 NaN 3.0 1.0 + + Demonstrating the preference of the passed in dataframe. + + >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2]) + >>> df2.combine(df1, take_smaller) + A B C + 0 0.0 NaN NaN + 1 0.0 3.0 NaN + 2 NaN 3.0 NaN + + >>> df2.combine(df1, take_smaller, overwrite=False) + A B C + 0 0.0 NaN NaN + 1 0.0 3.0 1.0 + 2 NaN 3.0 1.0 + """ + other_idxlen = len(other.index) # save for compare + + this, other = self.align(other, copy=False) + new_index = this.index + + if other.empty and len(new_index) == len(self.index): + return self.copy() + + if self.empty and len(other) == other_idxlen: + return other.copy() + + # sorts if possible; otherwise align above ensures that these are set-equal + new_columns = this.columns.union(other.columns) + do_fill = fill_value is not None + result = {} + for col in new_columns: + series = this[col] + other_series = other[col] + + this_dtype = series.dtype + other_dtype = other_series.dtype + + this_mask = isna(series) + other_mask = isna(other_series) + + # don't overwrite columns unnecessarily + # DO propagate if this column is not in the intersection + if not overwrite and other_mask.all(): + result[col] = this[col].copy() + continue + + if do_fill: + series = series.copy() + other_series = other_series.copy() + series[this_mask] = fill_value + other_series[other_mask] = fill_value + + if col not in self.columns: + # If self DataFrame does not have col in other DataFrame, + # try to promote series, which is all NaN, as other_dtype. + new_dtype = other_dtype + try: + series = series.astype(new_dtype, copy=False) + except ValueError: + # e.g. new_dtype is integer types + pass + else: + # if we have different dtypes, possibly promote + new_dtype = find_common_type([this_dtype, other_dtype]) + series = series.astype(new_dtype, copy=False) + other_series = other_series.astype(new_dtype, copy=False) + + arr = func(series, other_series) + if isinstance(new_dtype, np.dtype): + # if new_dtype is an EA Dtype, then `func` is expected to return + # the correct dtype without any additional casting + # error: No overload variant of "maybe_downcast_to_dtype" matches + # argument types "Union[Series, Hashable]", "dtype[Any]" + arr = maybe_downcast_to_dtype( # type: ignore[call-overload] + arr, new_dtype + ) + + result[col] = arr + + # convert_objects just in case + frame_result = self._constructor(result, index=new_index, columns=new_columns) + return frame_result.__finalize__(self, method="combine") + + def combine_first(self, other: DataFrame) -> DataFrame: + """ + Update null elements with value in the same location in `other`. + + Combine two DataFrame objects by filling null values in one DataFrame + with non-null values from other DataFrame. The row and column indexes + of the resulting DataFrame will be the union of the two. The resulting + dataframe contains the 'first' dataframe values and overrides the + second one values where both first.loc[index, col] and + second.loc[index, col] are not missing values, upon calling + first.combine_first(second). + + Parameters + ---------- + other : DataFrame + Provided DataFrame to use to fill null values. + + Returns + ------- + DataFrame + The result of combining the provided DataFrame with the other object. + + See Also + -------- + DataFrame.combine : Perform series-wise operation on two DataFrames + using a given function. + + Examples + -------- + >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]}) + >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> df1.combine_first(df2) + A B + 0 1.0 3.0 + 1 0.0 4.0 + + Null values still persist if the location of that null value + does not exist in `other` + + >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]}) + >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2]) + >>> df1.combine_first(df2) + A B C + 0 NaN 4.0 NaN + 1 0.0 3.0 1.0 + 2 NaN 3.0 1.0 + """ + from pandas.core.computation import expressions + + def combiner(x: Series, y: Series): + mask = x.isna()._values + + x_values = x._values + y_values = y._values + + # If the column y in other DataFrame is not in first DataFrame, + # just return y_values. + if y.name not in self.columns: + return y_values + + return expressions.where(mask, y_values, x_values) + + if len(other) == 0: + combined = self.reindex( + self.columns.append(other.columns.difference(self.columns)), axis=1 + ) + combined = combined.astype(other.dtypes) + else: + combined = self.combine(other, combiner, overwrite=False) + + dtypes = { + col: find_common_type([self.dtypes[col], other.dtypes[col]]) + for col in self.columns.intersection(other.columns) + if combined.dtypes[col] != self.dtypes[col] + } + + if dtypes: + combined = combined.astype(dtypes) + + return combined.__finalize__(self, method="combine_first") + + def update( + self, + other, + join: UpdateJoin = "left", + overwrite: bool = True, + filter_func=None, + errors: IgnoreRaise = "ignore", + ) -> None: + """ + Modify in place using non-NA values from another DataFrame. + + Aligns on indices. There is no return value. + + Parameters + ---------- + other : DataFrame, or object coercible into a DataFrame + Should have at least one matching index/column label + with the original DataFrame. If a Series is passed, + its name attribute must be set, and that will be + used as the column name to align with the original DataFrame. + join : {'left'}, default 'left' + Only left join is implemented, keeping the index and columns of the + original object. + overwrite : bool, default True + How to handle non-NA values for overlapping keys: + + * True: overwrite original DataFrame's values + with values from `other`. + * False: only update values that are NA in + the original DataFrame. + + filter_func : callable(1d-array) -> bool 1d-array, optional + Can choose to replace values other than NA. Return True for values + that should be updated. + errors : {'raise', 'ignore'}, default 'ignore' + If 'raise', will raise a ValueError if the DataFrame and `other` + both contain non-NA data in the same place. + + Returns + ------- + None + This method directly changes calling object. + + Raises + ------ + ValueError + * When `errors='raise'` and there's overlapping non-NA data. + * When `errors` is not either `'ignore'` or `'raise'` + NotImplementedError + * If `join != 'left'` + + See Also + -------- + dict.update : Similar method for dictionaries. + DataFrame.merge : For column(s)-on-column(s) operations. + + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 2, 3], + ... 'B': [400, 500, 600]}) + >>> new_df = pd.DataFrame({'B': [4, 5, 6], + ... 'C': [7, 8, 9]}) + >>> df.update(new_df) + >>> df + A B + 0 1 4 + 1 2 5 + 2 3 6 + + The DataFrame's length does not increase as a result of the update, + only values at matching index/column labels are updated. + + >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], + ... 'B': ['x', 'y', 'z']}) + >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']}) + >>> df.update(new_df) + >>> df + A B + 0 a d + 1 b e + 2 c f + + >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], + ... 'B': ['x', 'y', 'z']}) + >>> new_df = pd.DataFrame({'B': ['d', 'f']}, index=[0, 2]) + >>> df.update(new_df) + >>> df + A B + 0 a d + 1 b y + 2 c f + + For Series, its name attribute must be set. + + >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], + ... 'B': ['x', 'y', 'z']}) + >>> new_column = pd.Series(['d', 'e', 'f'], name='B') + >>> df.update(new_column) + >>> df + A B + 0 a d + 1 b e + 2 c f + + If `other` contains NaNs the corresponding values are not updated + in the original dataframe. + + >>> df = pd.DataFrame({'A': [1, 2, 3], + ... 'B': [400., 500., 600.]}) + >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]}) + >>> df.update(new_df) + >>> df + A B + 0 1 4.0 + 1 2 500.0 + 2 3 6.0 + """ + + if not PYPY and using_copy_on_write(): + if sys.getrefcount(self) <= REF_COUNT: + warnings.warn( + _chained_assignment_method_msg, + ChainedAssignmentError, + stacklevel=2, + ) + elif not PYPY and not using_copy_on_write() and self._is_view_after_cow_rules(): + if sys.getrefcount(self) <= REF_COUNT: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) + + # TODO: Support other joins + if join != "left": # pragma: no cover + raise NotImplementedError("Only left join is supported") + if errors not in ["ignore", "raise"]: + raise ValueError("The parameter errors must be either 'ignore' or 'raise'") + + if not isinstance(other, DataFrame): + other = DataFrame(other) + + other = other.reindex(self.index) + + for col in self.columns.intersection(other.columns): + this = self[col]._values + that = other[col]._values + + if filter_func is not None: + mask = ~filter_func(this) | isna(that) + else: + if errors == "raise": + mask_this = notna(that) + mask_that = notna(this) + if any(mask_this & mask_that): + raise ValueError("Data overlaps.") + + if overwrite: + mask = isna(that) + else: + mask = notna(this) + + # don't overwrite columns unnecessarily + if mask.all(): + continue + + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message="Downcasting behavior", + category=FutureWarning, + ) + # GH#57124 - `that` might get upcasted because of NA values, and then + # downcasted in where because of the mask. Ignoring the warning + # is a stopgap, will replace with a new implementation of update + # in 3.0. + self.loc[:, col] = self[col].where(mask, that) + + # ---------------------------------------------------------------------- + # Data reshaping + @Appender( + dedent( + """ + Examples + -------- + >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', + ... 'Parrot', 'Parrot'], + ... 'Max Speed': [380., 370., 24., 26.]}) + >>> df + Animal Max Speed + 0 Falcon 380.0 + 1 Falcon 370.0 + 2 Parrot 24.0 + 3 Parrot 26.0 + >>> df.groupby(['Animal']).mean() + Max Speed + Animal + Falcon 375.0 + Parrot 25.0 + + **Hierarchical Indexes** + + We can groupby different levels of a hierarchical index + using the `level` parameter: + + >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], + ... ['Captive', 'Wild', 'Captive', 'Wild']] + >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) + >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, + ... index=index) + >>> df + Max Speed + Animal Type + Falcon Captive 390.0 + Wild 350.0 + Parrot Captive 30.0 + Wild 20.0 + >>> df.groupby(level=0).mean() + Max Speed + Animal + Falcon 370.0 + Parrot 25.0 + >>> df.groupby(level="Type").mean() + Max Speed + Type + Captive 210.0 + Wild 185.0 + + We can also choose to include NA in group keys or not by setting + `dropna` parameter, the default setting is `True`. + + >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] + >>> df = pd.DataFrame(l, columns=["a", "b", "c"]) + + >>> df.groupby(by=["b"]).sum() + a c + b + 1.0 2 3 + 2.0 2 5 + + >>> df.groupby(by=["b"], dropna=False).sum() + a c + b + 1.0 2 3 + 2.0 2 5 + NaN 1 4 + + >>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]] + >>> df = pd.DataFrame(l, columns=["a", "b", "c"]) + + >>> df.groupby(by="a").sum() + b c + a + a 13.0 13.0 + b 12.3 123.0 + + >>> df.groupby(by="a", dropna=False).sum() + b c + a + a 13.0 13.0 + b 12.3 123.0 + NaN 12.3 33.0 + + When using ``.apply()``, use ``group_keys`` to include or exclude the + group keys. The ``group_keys`` argument defaults to ``True`` (include). + + >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', + ... 'Parrot', 'Parrot'], + ... 'Max Speed': [380., 370., 24., 26.]}) + >>> df.groupby("Animal", group_keys=True)[['Max Speed']].apply(lambda x: x) + Max Speed + Animal + Falcon 0 380.0 + 1 370.0 + Parrot 2 24.0 + 3 26.0 + + >>> df.groupby("Animal", group_keys=False)[['Max Speed']].apply(lambda x: x) + Max Speed + 0 380.0 + 1 370.0 + 2 24.0 + 3 26.0 + """ + ) + ) + @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) + def groupby( + self, + by=None, + axis: Axis | lib.NoDefault = lib.no_default, + level: IndexLabel | None = None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + observed: bool | lib.NoDefault = lib.no_default, + dropna: bool = True, + ) -> DataFrameGroupBy: + if axis is not lib.no_default: + axis = self._get_axis_number(axis) + if axis == 1: + warnings.warn( + "DataFrame.groupby with axis=1 is deprecated. Do " + "`frame.T.groupby(...)` without axis instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + warnings.warn( + "The 'axis' keyword in DataFrame.groupby is deprecated and " + "will be removed in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + axis = 0 + + from pandas.core.groupby.generic import DataFrameGroupBy + + if level is None and by is None: + raise TypeError("You have to supply one of 'by' and 'level'") + + return DataFrameGroupBy( + obj=self, + keys=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + observed=observed, + dropna=dropna, + ) + + _shared_docs[ + "pivot" + ] = """ + Return reshaped DataFrame organized by given index / column values. + + Reshape data (produce a "pivot" table) based on column values. Uses + unique values from specified `index` / `columns` to form axes of the + resulting DataFrame. This function does not support data + aggregation, multiple values will result in a MultiIndex in the + columns. See the :ref:`User Guide ` for more on reshaping. + + Parameters + ----------%s + columns : str or object or a list of str + Column to use to make new frame's columns. + index : str or object or a list of str, optional + Column to use to make new frame's index. If not given, uses existing index. + values : str, object or a list of the previous, optional + Column(s) to use for populating new frame's values. If not + specified, all remaining columns will be used and the result will + have hierarchically indexed columns. + + Returns + ------- + DataFrame + Returns reshaped DataFrame. + + Raises + ------ + ValueError: + When there are any `index`, `columns` combinations with multiple + values. `DataFrame.pivot_table` when you need to aggregate. + + See Also + -------- + DataFrame.pivot_table : Generalization of pivot that can handle + duplicate values for one index/column pair. + DataFrame.unstack : Pivot based on the index values instead of a + column. + wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt. + + Notes + ----- + For finer-tuned control, see hierarchical indexing documentation along + with the related stack/unstack methods. + + Reference :ref:`the user guide ` for more examples. + + Examples + -------- + >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', + ... 'two'], + ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], + ... 'baz': [1, 2, 3, 4, 5, 6], + ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']}) + >>> df + foo bar baz zoo + 0 one A 1 x + 1 one B 2 y + 2 one C 3 z + 3 two A 4 q + 4 two B 5 w + 5 two C 6 t + + >>> df.pivot(index='foo', columns='bar', values='baz') + bar A B C + foo + one 1 2 3 + two 4 5 6 + + >>> df.pivot(index='foo', columns='bar')['baz'] + bar A B C + foo + one 1 2 3 + two 4 5 6 + + >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo']) + baz zoo + bar A B C A B C + foo + one 1 2 3 x y z + two 4 5 6 q w t + + You could also assign a list of column names or a list of index names. + + >>> df = pd.DataFrame({ + ... "lev1": [1, 1, 1, 2, 2, 2], + ... "lev2": [1, 1, 2, 1, 1, 2], + ... "lev3": [1, 2, 1, 2, 1, 2], + ... "lev4": [1, 2, 3, 4, 5, 6], + ... "values": [0, 1, 2, 3, 4, 5]}) + >>> df + lev1 lev2 lev3 lev4 values + 0 1 1 1 1 0 + 1 1 1 2 2 1 + 2 1 2 1 3 2 + 3 2 1 2 4 3 + 4 2 1 1 5 4 + 5 2 2 2 6 5 + + >>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values") + lev2 1 2 + lev3 1 2 1 2 + lev1 + 1 0.0 1.0 2.0 NaN + 2 4.0 3.0 NaN 5.0 + + >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values") + lev3 1 2 + lev1 lev2 + 1 1 0.0 1.0 + 2 2.0 NaN + 2 1 4.0 3.0 + 2 NaN 5.0 + + A ValueError is raised if there are any duplicates. + + >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'], + ... "bar": ['A', 'A', 'B', 'C'], + ... "baz": [1, 2, 3, 4]}) + >>> df + foo bar baz + 0 one A 1 + 1 one A 2 + 2 two B 3 + 3 two C 4 + + Notice that the first two rows are the same for our `index` + and `columns` arguments. + + >>> df.pivot(index='foo', columns='bar', values='baz') + Traceback (most recent call last): + ... + ValueError: Index contains duplicate entries, cannot reshape + """ + + @Substitution("") + @Appender(_shared_docs["pivot"]) + def pivot( + self, *, columns, index=lib.no_default, values=lib.no_default + ) -> DataFrame: + from pandas.core.reshape.pivot import pivot + + return pivot(self, index=index, columns=columns, values=values) + + _shared_docs[ + "pivot_table" + ] = """ + Create a spreadsheet-style pivot table as a DataFrame. + + The levels in the pivot table will be stored in MultiIndex objects + (hierarchical indexes) on the index and columns of the result DataFrame. + + Parameters + ----------%s + values : list-like or scalar, optional + Column or columns to aggregate. + index : column, Grouper, array, or list of the previous + Keys to group by on the pivot table index. If a list is passed, + it can contain any of the other types (except list). If an array is + passed, it must be the same length as the data and will be used in + the same manner as column values. + columns : column, Grouper, array, or list of the previous + Keys to group by on the pivot table column. If a list is passed, + it can contain any of the other types (except list). If an array is + passed, it must be the same length as the data and will be used in + the same manner as column values. + aggfunc : function, list of functions, dict, default "mean" + If a list of functions is passed, the resulting pivot table will have + hierarchical columns whose top level are the function names + (inferred from the function objects themselves). + If a dict is passed, the key is column to aggregate and the value is + function or list of functions. If ``margin=True``, aggfunc will be + used to calculate the partial aggregates. + fill_value : scalar, default None + Value to replace missing values with (in the resulting pivot table, + after aggregation). + margins : bool, default False + If ``margins=True``, special ``All`` columns and rows + will be added with partial group aggregates across the categories + on the rows and columns. + dropna : bool, default True + Do not include columns whose entries are all NaN. If True, + rows with a NaN value in any column will be omitted before + computing margins. + margins_name : str, default 'All' + Name of the row / column that will contain the totals + when margins is True. + observed : bool, default False + This only applies if any of the groupers are Categoricals. + If True: only show observed values for categorical groupers. + If False: show all values for categorical groupers. + + .. deprecated:: 2.2.0 + + The default value of ``False`` is deprecated and will change to + ``True`` in a future version of pandas. + + sort : bool, default True + Specifies if the result should be sorted. + + .. versionadded:: 1.3.0 + + Returns + ------- + DataFrame + An Excel style pivot table. + + See Also + -------- + DataFrame.pivot : Pivot without aggregation that can handle + non-numeric data. + DataFrame.melt: Unpivot a DataFrame from wide to long format, + optionally leaving identifiers set. + wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt. + + Notes + ----- + Reference :ref:`the user guide ` for more examples. + + Examples + -------- + >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo", + ... "bar", "bar", "bar", "bar"], + ... "B": ["one", "one", "one", "two", "two", + ... "one", "one", "two", "two"], + ... "C": ["small", "large", "large", "small", + ... "small", "large", "small", "small", + ... "large"], + ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]}) + >>> df + A B C D E + 0 foo one small 1 2 + 1 foo one large 2 4 + 2 foo one large 2 5 + 3 foo two small 3 5 + 4 foo two small 3 6 + 5 bar one large 4 6 + 6 bar one small 5 8 + 7 bar two small 6 9 + 8 bar two large 7 9 + + This first example aggregates values by taking the sum. + + >>> table = pd.pivot_table(df, values='D', index=['A', 'B'], + ... columns=['C'], aggfunc="sum") + >>> table + C large small + A B + bar one 4.0 5.0 + two 7.0 6.0 + foo one 4.0 1.0 + two NaN 6.0 + + We can also fill missing values using the `fill_value` parameter. + + >>> table = pd.pivot_table(df, values='D', index=['A', 'B'], + ... columns=['C'], aggfunc="sum", fill_value=0) + >>> table + C large small + A B + bar one 4 5 + two 7 6 + foo one 4 1 + two 0 6 + + The next example aggregates by taking the mean across multiple columns. + + >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'], + ... aggfunc={'D': "mean", 'E': "mean"}) + >>> table + D E + A C + bar large 5.500000 7.500000 + small 5.500000 8.500000 + foo large 2.000000 4.500000 + small 2.333333 4.333333 + + We can also calculate multiple types of aggregations for any given + value column. + + >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'], + ... aggfunc={'D': "mean", + ... 'E': ["min", "max", "mean"]}) + >>> table + D E + mean max mean min + A C + bar large 5.500000 9 7.500000 6 + small 5.500000 9 8.500000 8 + foo large 2.000000 5 4.500000 4 + small 2.333333 6 4.333333 2 + """ + + @Substitution("") + @Appender(_shared_docs["pivot_table"]) + def pivot_table( + self, + values=None, + index=None, + columns=None, + aggfunc: AggFuncType = "mean", + fill_value=None, + margins: bool = False, + dropna: bool = True, + margins_name: Level = "All", + observed: bool | lib.NoDefault = lib.no_default, + sort: bool = True, + ) -> DataFrame: + from pandas.core.reshape.pivot import pivot_table + + return pivot_table( + self, + values=values, + index=index, + columns=columns, + aggfunc=aggfunc, + fill_value=fill_value, + margins=margins, + dropna=dropna, + margins_name=margins_name, + observed=observed, + sort=sort, + ) + + def stack( + self, + level: IndexLabel = -1, + dropna: bool | lib.NoDefault = lib.no_default, + sort: bool | lib.NoDefault = lib.no_default, + future_stack: bool = False, + ): + """ + Stack the prescribed level(s) from columns to index. + + Return a reshaped DataFrame or Series having a multi-level + index with one or more new inner-most levels compared to the current + DataFrame. The new inner-most levels are created by pivoting the + columns of the current dataframe: + + - if the columns have a single level, the output is a Series; + - if the columns have multiple levels, the new index + level(s) is (are) taken from the prescribed level(s) and + the output is a DataFrame. + + Parameters + ---------- + level : int, str, list, default -1 + Level(s) to stack from the column axis onto the index + axis, defined as one index or label, or a list of indices + or labels. + dropna : bool, default True + Whether to drop rows in the resulting Frame/Series with + missing values. Stacking a column level onto the index + axis can create combinations of index and column values + that are missing from the original dataframe. See Examples + section. + sort : bool, default True + Whether to sort the levels of the resulting MultiIndex. + future_stack : bool, default False + Whether to use the new implementation that will replace the current + implementation in pandas 3.0. When True, dropna and sort have no impact + on the result and must remain unspecified. See :ref:`pandas 2.1.0 Release + notes ` for more details. + + Returns + ------- + DataFrame or Series + Stacked dataframe or series. + + See Also + -------- + DataFrame.unstack : Unstack prescribed level(s) from index axis + onto column axis. + DataFrame.pivot : Reshape dataframe from long format to wide + format. + DataFrame.pivot_table : Create a spreadsheet-style pivot table + as a DataFrame. + + Notes + ----- + The function is named by analogy with a collection of books + being reorganized from being side by side on a horizontal + position (the columns of the dataframe) to being stacked + vertically on top of each other (in the index of the + dataframe). + + Reference :ref:`the user guide ` for more examples. + + Examples + -------- + **Single level columns** + + >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]], + ... index=['cat', 'dog'], + ... columns=['weight', 'height']) + + Stacking a dataframe with a single level column axis returns a Series: + + >>> df_single_level_cols + weight height + cat 0 1 + dog 2 3 + >>> df_single_level_cols.stack(future_stack=True) + cat weight 0 + height 1 + dog weight 2 + height 3 + dtype: int64 + + **Multi level columns: simple case** + + >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'), + ... ('weight', 'pounds')]) + >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]], + ... index=['cat', 'dog'], + ... columns=multicol1) + + Stacking a dataframe with a multi-level column axis: + + >>> df_multi_level_cols1 + weight + kg pounds + cat 1 2 + dog 2 4 + >>> df_multi_level_cols1.stack(future_stack=True) + weight + cat kg 1 + pounds 2 + dog kg 2 + pounds 4 + + **Missing values** + + >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'), + ... ('height', 'm')]) + >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]], + ... index=['cat', 'dog'], + ... columns=multicol2) + + It is common to have missing values when stacking a dataframe + with multi-level columns, as the stacked dataframe typically + has more values than the original dataframe. Missing values + are filled with NaNs: + + >>> df_multi_level_cols2 + weight height + kg m + cat 1.0 2.0 + dog 3.0 4.0 + >>> df_multi_level_cols2.stack(future_stack=True) + weight height + cat kg 1.0 NaN + m NaN 2.0 + dog kg 3.0 NaN + m NaN 4.0 + + **Prescribing the level(s) to be stacked** + + The first parameter controls which level or levels are stacked: + + >>> df_multi_level_cols2.stack(0, future_stack=True) + kg m + cat weight 1.0 NaN + height NaN 2.0 + dog weight 3.0 NaN + height NaN 4.0 + >>> df_multi_level_cols2.stack([0, 1], future_stack=True) + cat weight kg 1.0 + height m 2.0 + dog weight kg 3.0 + height m 4.0 + dtype: float64 + """ + if not future_stack: + from pandas.core.reshape.reshape import ( + stack, + stack_multiple, + ) + + if ( + dropna is not lib.no_default + or sort is not lib.no_default + or self.columns.nlevels > 1 + ): + warnings.warn( + "The previous implementation of stack is deprecated and will be " + "removed in a future version of pandas. See the What's New notes " + "for pandas 2.1.0 for details. Specify future_stack=True to adopt " + "the new implementation and silence this warning.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if dropna is lib.no_default: + dropna = True + if sort is lib.no_default: + sort = True + + if isinstance(level, (tuple, list)): + result = stack_multiple(self, level, dropna=dropna, sort=sort) + else: + result = stack(self, level, dropna=dropna, sort=sort) + else: + from pandas.core.reshape.reshape import stack_v3 + + if dropna is not lib.no_default: + raise ValueError( + "dropna must be unspecified with future_stack=True as the new " + "implementation does not introduce rows of NA values. This " + "argument will be removed in a future version of pandas." + ) + + if sort is not lib.no_default: + raise ValueError( + "Cannot specify sort with future_stack=True, this argument will be " + "removed in a future version of pandas. Sort the result using " + ".sort_index instead." + ) + + if ( + isinstance(level, (tuple, list)) + and not all(lev in self.columns.names for lev in level) + and not all(isinstance(lev, int) for lev in level) + ): + raise ValueError( + "level should contain all level names or all level " + "numbers, not a mixture of the two." + ) + + if not isinstance(level, (tuple, list)): + level = [level] + level = [self.columns._get_level_number(lev) for lev in level] + result = stack_v3(self, level) + + return result.__finalize__(self, method="stack") + + def explode( + self, + column: IndexLabel, + ignore_index: bool = False, + ) -> DataFrame: + """ + Transform each element of a list-like to a row, replicating index values. + + Parameters + ---------- + column : IndexLabel + Column(s) to explode. + For multiple columns, specify a non-empty list with each element + be str or tuple, and all specified columns their list-like data + on same row of the frame must have matching length. + + .. versionadded:: 1.3.0 + Multi-column explode + + ignore_index : bool, default False + If True, the resulting index will be labeled 0, 1, …, n - 1. + + Returns + ------- + DataFrame + Exploded lists to rows of the subset columns; + index will be duplicated for these rows. + + Raises + ------ + ValueError : + * If columns of the frame are not unique. + * If specified columns to explode is empty list. + * If specified columns to explode have not matching count of + elements rowwise in the frame. + + See Also + -------- + DataFrame.unstack : Pivot a level of the (necessarily hierarchical) + index labels. + DataFrame.melt : Unpivot a DataFrame from wide format to long format. + Series.explode : Explode a DataFrame from list-like columns to long format. + + Notes + ----- + This routine will explode list-likes including lists, tuples, sets, + Series, and np.ndarray. The result dtype of the subset rows will + be object. Scalars will be returned unchanged, and empty list-likes will + result in a np.nan for that row. In addition, the ordering of rows in the + output will be non-deterministic when exploding sets. + + Reference :ref:`the user guide ` for more examples. + + Examples + -------- + >>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]], + ... 'B': 1, + ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]}) + >>> df + A B C + 0 [0, 1, 2] 1 [a, b, c] + 1 foo 1 NaN + 2 [] 1 [] + 3 [3, 4] 1 [d, e] + + Single-column explode. + + >>> df.explode('A') + A B C + 0 0 1 [a, b, c] + 0 1 1 [a, b, c] + 0 2 1 [a, b, c] + 1 foo 1 NaN + 2 NaN 1 [] + 3 3 1 [d, e] + 3 4 1 [d, e] + + Multi-column explode. + + >>> df.explode(list('AC')) + A B C + 0 0 1 a + 0 1 1 b + 0 2 1 c + 1 foo 1 NaN + 2 NaN 1 NaN + 3 3 1 d + 3 4 1 e + """ + if not self.columns.is_unique: + duplicate_cols = self.columns[self.columns.duplicated()].tolist() + raise ValueError( + f"DataFrame columns must be unique. Duplicate columns: {duplicate_cols}" + ) + + columns: list[Hashable] + if is_scalar(column) or isinstance(column, tuple): + columns = [column] + elif isinstance(column, list) and all( + is_scalar(c) or isinstance(c, tuple) for c in column + ): + if not column: + raise ValueError("column must be nonempty") + if len(column) > len(set(column)): + raise ValueError("column must be unique") + columns = column + else: + raise ValueError("column must be a scalar, tuple, or list thereof") + + df = self.reset_index(drop=True) + if len(columns) == 1: + result = df[columns[0]].explode() + else: + mylen = lambda x: len(x) if (is_list_like(x) and len(x) > 0) else 1 + counts0 = self[columns[0]].apply(mylen) + for c in columns[1:]: + if not all(counts0 == self[c].apply(mylen)): + raise ValueError("columns must have matching element counts") + result = DataFrame({c: df[c].explode() for c in columns}) + result = df.drop(columns, axis=1).join(result) + if ignore_index: + result.index = default_index(len(result)) + else: + result.index = self.index.take(result.index) + result = result.reindex(columns=self.columns, copy=False) + + return result.__finalize__(self, method="explode") + + def unstack(self, level: IndexLabel = -1, fill_value=None, sort: bool = True): + """ + Pivot a level of the (necessarily hierarchical) index labels. + + Returns a DataFrame having a new level of column labels whose inner-most level + consists of the pivoted index labels. + + If the index is not a MultiIndex, the output will be a Series + (the analogue of stack when the columns are not a MultiIndex). + + Parameters + ---------- + level : int, str, or list of these, default -1 (last level) + Level(s) of index to unstack, can pass level name. + fill_value : int, str or dict + Replace NaN with this value if the unstack produces missing values. + sort : bool, default True + Sort the level(s) in the resulting MultiIndex columns. + + Returns + ------- + Series or DataFrame + + See Also + -------- + DataFrame.pivot : Pivot a table based on column values. + DataFrame.stack : Pivot a level of the column labels (inverse operation + from `unstack`). + + Notes + ----- + Reference :ref:`the user guide ` for more examples. + + Examples + -------- + >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), + ... ('two', 'a'), ('two', 'b')]) + >>> s = pd.Series(np.arange(1.0, 5.0), index=index) + >>> s + one a 1.0 + b 2.0 + two a 3.0 + b 4.0 + dtype: float64 + + >>> s.unstack(level=-1) + a b + one 1.0 2.0 + two 3.0 4.0 + + >>> s.unstack(level=0) + one two + a 1.0 3.0 + b 2.0 4.0 + + >>> df = s.unstack(level=0) + >>> df.unstack() + one a 1.0 + b 2.0 + two a 3.0 + b 4.0 + dtype: float64 + """ + from pandas.core.reshape.reshape import unstack + + result = unstack(self, level, fill_value, sort) + + return result.__finalize__(self, method="unstack") + + @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"}) + def melt( + self, + id_vars=None, + value_vars=None, + var_name=None, + value_name: Hashable = "value", + col_level: Level | None = None, + ignore_index: bool = True, + ) -> DataFrame: + return melt( + self, + id_vars=id_vars, + value_vars=value_vars, + var_name=var_name, + value_name=value_name, + col_level=col_level, + ignore_index=ignore_index, + ).__finalize__(self, method="melt") + + # ---------------------------------------------------------------------- + # Time series-related + + @doc( + Series.diff, + klass="DataFrame", + extra_params="axis : {0 or 'index', 1 or 'columns'}, default 0\n " + "Take difference over rows (0) or columns (1).\n", + other_klass="Series", + examples=dedent( + """ + Difference with previous row + + >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6], + ... 'b': [1, 1, 2, 3, 5, 8], + ... 'c': [1, 4, 9, 16, 25, 36]}) + >>> df + a b c + 0 1 1 1 + 1 2 1 4 + 2 3 2 9 + 3 4 3 16 + 4 5 5 25 + 5 6 8 36 + + >>> df.diff() + a b c + 0 NaN NaN NaN + 1 1.0 0.0 3.0 + 2 1.0 1.0 5.0 + 3 1.0 1.0 7.0 + 4 1.0 2.0 9.0 + 5 1.0 3.0 11.0 + + Difference with previous column + + >>> df.diff(axis=1) + a b c + 0 NaN 0 0 + 1 NaN -1 3 + 2 NaN -1 7 + 3 NaN -1 13 + 4 NaN 0 20 + 5 NaN 2 28 + + Difference with 3rd previous row + + >>> df.diff(periods=3) + a b c + 0 NaN NaN NaN + 1 NaN NaN NaN + 2 NaN NaN NaN + 3 3.0 2.0 15.0 + 4 3.0 4.0 21.0 + 5 3.0 6.0 27.0 + + Difference with following row + + >>> df.diff(periods=-1) + a b c + 0 -1.0 0.0 -3.0 + 1 -1.0 -1.0 -5.0 + 2 -1.0 -1.0 -7.0 + 3 -1.0 -2.0 -9.0 + 4 -1.0 -3.0 -11.0 + 5 NaN NaN NaN + + Overflow in input dtype + + >>> df = pd.DataFrame({'a': [1, 0]}, dtype=np.uint8) + >>> df.diff() + a + 0 NaN + 1 255.0""" + ), + ) + def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame: + if not lib.is_integer(periods): + if not (is_float(periods) and periods.is_integer()): + raise ValueError("periods must be an integer") + periods = int(periods) + + axis = self._get_axis_number(axis) + if axis == 1: + if periods != 0: + # in the periods == 0 case, this is equivalent diff of 0 periods + # along axis=0, and the Manager method may be somewhat more + # performant, so we dispatch in that case. + return self - self.shift(periods, axis=axis) + # With periods=0 this is equivalent to a diff with axis=0 + axis = 0 + + new_data = self._mgr.diff(n=periods) + res_df = self._constructor_from_mgr(new_data, axes=new_data.axes) + return res_df.__finalize__(self, "diff") + + # ---------------------------------------------------------------------- + # Function application + + def _gotitem( + self, + key: IndexLabel, + ndim: int, + subset: DataFrame | Series | None = None, + ) -> DataFrame | Series: + """ + Sub-classes to define. Return a sliced object. + + Parameters + ---------- + key : string / list of selections + ndim : {1, 2} + requested ndim of result + subset : object, default None + subset to act on + """ + if subset is None: + subset = self + elif subset.ndim == 1: # is Series + return subset + + # TODO: _shallow_copy(subset)? + return subset[key] + + _agg_see_also_doc = dedent( + """ + See Also + -------- + DataFrame.apply : Perform any type of operations. + DataFrame.transform : Perform transformation type operations. + pandas.DataFrame.groupby : Perform operations over groups. + pandas.DataFrame.resample : Perform operations over resampled bins. + pandas.DataFrame.rolling : Perform operations over rolling window. + pandas.DataFrame.expanding : Perform operations over expanding window. + pandas.core.window.ewm.ExponentialMovingWindow : Perform operation over exponential + weighted window. + """ + ) + + _agg_examples_doc = dedent( + """ + Examples + -------- + >>> df = pd.DataFrame([[1, 2, 3], + ... [4, 5, 6], + ... [7, 8, 9], + ... [np.nan, np.nan, np.nan]], + ... columns=['A', 'B', 'C']) + + Aggregate these functions over the rows. + + >>> df.agg(['sum', 'min']) + A B C + sum 12.0 15.0 18.0 + min 1.0 2.0 3.0 + + Different aggregations per column. + + >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']}) + A B + sum 12.0 NaN + min 1.0 2.0 + max NaN 8.0 + + Aggregate different functions over the columns and rename the index of the resulting + DataFrame. + + >>> df.agg(x=('A', 'max'), y=('B', 'min'), z=('C', 'mean')) + A B C + x 7.0 NaN NaN + y NaN 2.0 NaN + z NaN NaN 6.0 + + Aggregate over the columns. + + >>> df.agg("mean", axis="columns") + 0 2.0 + 1 5.0 + 2 8.0 + 3 NaN + dtype: float64 + """ + ) + + @doc( + _shared_docs["aggregate"], + klass=_shared_doc_kwargs["klass"], + axis=_shared_doc_kwargs["axis"], + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + ) + def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs): + from pandas.core.apply import frame_apply + + axis = self._get_axis_number(axis) + + op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs) + result = op.agg() + result = reconstruct_and_relabel_result(result, func, **kwargs) + return result + + agg = aggregate + + @doc( + _shared_docs["transform"], + klass=_shared_doc_kwargs["klass"], + axis=_shared_doc_kwargs["axis"], + ) + def transform( + self, func: AggFuncType, axis: Axis = 0, *args, **kwargs + ) -> DataFrame: + from pandas.core.apply import frame_apply + + op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs) + result = op.transform() + assert isinstance(result, DataFrame) + return result + + def apply( + self, + func: AggFuncType, + axis: Axis = 0, + raw: bool = False, + result_type: Literal["expand", "reduce", "broadcast"] | None = None, + args=(), + by_row: Literal[False, "compat"] = "compat", + engine: Literal["python", "numba"] = "python", + engine_kwargs: dict[str, bool] | None = None, + **kwargs, + ): + """ + Apply a function along an axis of the DataFrame. + + Objects passed to the function are Series objects whose index is + either the DataFrame's index (``axis=0``) or the DataFrame's columns + (``axis=1``). By default (``result_type=None``), the final return type + is inferred from the return type of the applied function. Otherwise, + it depends on the `result_type` argument. + + Parameters + ---------- + func : function + Function to apply to each column or row. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Axis along which the function is applied: + + * 0 or 'index': apply function to each column. + * 1 or 'columns': apply function to each row. + + raw : bool, default False + Determines if row or column is passed as a Series or ndarray object: + + * ``False`` : passes each row or column as a Series to the + function. + * ``True`` : the passed function will receive ndarray objects + instead. + If you are just applying a NumPy reduction function this will + achieve much better performance. + + result_type : {'expand', 'reduce', 'broadcast', None}, default None + These only act when ``axis=1`` (columns): + + * 'expand' : list-like results will be turned into columns. + * 'reduce' : returns a Series if possible rather than expanding + list-like results. This is the opposite of 'expand'. + * 'broadcast' : results will be broadcast to the original shape + of the DataFrame, the original index and columns will be + retained. + + The default behaviour (None) depends on the return value of the + applied function: list-like results will be returned as a Series + of those. However if the apply function returns a Series these + are expanded to columns. + args : tuple + Positional arguments to pass to `func` in addition to the + array/series. + by_row : False or "compat", default "compat" + Only has an effect when ``func`` is a listlike or dictlike of funcs + and the func isn't a string. + If "compat", will if possible first translate the func into pandas + methods (e.g. ``Series().apply(np.sum)`` will be translated to + ``Series().sum()``). If that doesn't work, will try call to apply again with + ``by_row=True`` and if that fails, will call apply again with + ``by_row=False`` (backward compatible). + If False, the funcs will be passed the whole Series at once. + + .. versionadded:: 2.1.0 + + engine : {'python', 'numba'}, default 'python' + Choose between the python (default) engine or the numba engine in apply. + + The numba engine will attempt to JIT compile the passed function, + which may result in speedups for large DataFrames. + It also supports the following engine_kwargs : + + - nopython (compile the function in nopython mode) + - nogil (release the GIL inside the JIT compiled function) + - parallel (try to apply the function in parallel over the DataFrame) + + Note: Due to limitations within numba/how pandas interfaces with numba, + you should only use this if raw=True + + Note: The numba compiler only supports a subset of + valid Python/numpy operations. + + Please read more about the `supported python features + `_ + and `supported numpy features + `_ + in numba to learn what you can or cannot use in the passed function. + + .. versionadded:: 2.2.0 + + engine_kwargs : dict + Pass keyword arguments to the engine. + This is currently only used by the numba engine, + see the documentation for the engine argument for more information. + **kwargs + Additional keyword arguments to pass as keywords arguments to + `func`. + + Returns + ------- + Series or DataFrame + Result of applying ``func`` along the given axis of the + DataFrame. + + See Also + -------- + DataFrame.map: For elementwise operations. + DataFrame.aggregate: Only perform aggregating type operations. + DataFrame.transform: Only perform transforming type operations. + + Notes + ----- + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` + for more details. + + Examples + -------- + >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B']) + >>> df + A B + 0 4 9 + 1 4 9 + 2 4 9 + + Using a numpy universal function (in this case the same as + ``np.sqrt(df)``): + + >>> df.apply(np.sqrt) + A B + 0 2.0 3.0 + 1 2.0 3.0 + 2 2.0 3.0 + + Using a reducing function on either axis + + >>> df.apply(np.sum, axis=0) + A 12 + B 27 + dtype: int64 + + >>> df.apply(np.sum, axis=1) + 0 13 + 1 13 + 2 13 + dtype: int64 + + Returning a list-like will result in a Series + + >>> df.apply(lambda x: [1, 2], axis=1) + 0 [1, 2] + 1 [1, 2] + 2 [1, 2] + dtype: object + + Passing ``result_type='expand'`` will expand list-like results + to columns of a Dataframe + + >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand') + 0 1 + 0 1 2 + 1 1 2 + 2 1 2 + + Returning a Series inside the function is similar to passing + ``result_type='expand'``. The resulting column names + will be the Series index. + + >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1) + foo bar + 0 1 2 + 1 1 2 + 2 1 2 + + Passing ``result_type='broadcast'`` will ensure the same shape + result, whether list-like or scalar is returned by the function, + and broadcast it along the axis. The resulting column names will + be the originals. + + >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast') + A B + 0 1 2 + 1 1 2 + 2 1 2 + """ + from pandas.core.apply import frame_apply + + op = frame_apply( + self, + func=func, + axis=axis, + raw=raw, + result_type=result_type, + by_row=by_row, + engine=engine, + engine_kwargs=engine_kwargs, + args=args, + kwargs=kwargs, + ) + return op.apply().__finalize__(self, method="apply") + + def map( + self, func: PythonFuncType, na_action: str | None = None, **kwargs + ) -> DataFrame: + """ + Apply a function to a Dataframe elementwise. + + .. versionadded:: 2.1.0 + + DataFrame.applymap was deprecated and renamed to DataFrame.map. + + This method applies a function that accepts and returns a scalar + to every element of a DataFrame. + + Parameters + ---------- + func : callable + Python function, returns a single value from a single value. + na_action : {None, 'ignore'}, default None + If 'ignore', propagate NaN values, without passing them to func. + **kwargs + Additional keyword arguments to pass as keywords arguments to + `func`. + + Returns + ------- + DataFrame + Transformed DataFrame. + + See Also + -------- + DataFrame.apply : Apply a function along input axis of DataFrame. + DataFrame.replace: Replace values given in `to_replace` with `value`. + Series.map : Apply a function elementwise on a Series. + + Examples + -------- + >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]]) + >>> df + 0 1 + 0 1.000 2.120 + 1 3.356 4.567 + + >>> df.map(lambda x: len(str(x))) + 0 1 + 0 3 4 + 1 5 5 + + Like Series.map, NA values can be ignored: + + >>> df_copy = df.copy() + >>> df_copy.iloc[0, 0] = pd.NA + >>> df_copy.map(lambda x: len(str(x)), na_action='ignore') + 0 1 + 0 NaN 4 + 1 5.0 5 + + It is also possible to use `map` with functions that are not + `lambda` functions: + + >>> df.map(round, ndigits=1) + 0 1 + 0 1.0 2.1 + 1 3.4 4.6 + + Note that a vectorized version of `func` often exists, which will + be much faster. You could square each number elementwise. + + >>> df.map(lambda x: x**2) + 0 1 + 0 1.000000 4.494400 + 1 11.262736 20.857489 + + But it's better to avoid map in that case. + + >>> df ** 2 + 0 1 + 0 1.000000 4.494400 + 1 11.262736 20.857489 + """ + if na_action not in {"ignore", None}: + raise ValueError( + f"na_action must be 'ignore' or None. Got {repr(na_action)}" + ) + + if self.empty: + return self.copy() + + func = functools.partial(func, **kwargs) + + def infer(x): + return x._map_values(func, na_action=na_action) + + return self.apply(infer).__finalize__(self, "map") + + def applymap( + self, func: PythonFuncType, na_action: NaAction | None = None, **kwargs + ) -> DataFrame: + """ + Apply a function to a Dataframe elementwise. + + .. deprecated:: 2.1.0 + + DataFrame.applymap has been deprecated. Use DataFrame.map instead. + + This method applies a function that accepts and returns a scalar + to every element of a DataFrame. + + Parameters + ---------- + func : callable + Python function, returns a single value from a single value. + na_action : {None, 'ignore'}, default None + If 'ignore', propagate NaN values, without passing them to func. + **kwargs + Additional keyword arguments to pass as keywords arguments to + `func`. + + Returns + ------- + DataFrame + Transformed DataFrame. + + See Also + -------- + DataFrame.apply : Apply a function along input axis of DataFrame. + DataFrame.map : Apply a function along input axis of DataFrame. + DataFrame.replace: Replace values given in `to_replace` with `value`. + + Examples + -------- + >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]]) + >>> df + 0 1 + 0 1.000 2.120 + 1 3.356 4.567 + + >>> df.map(lambda x: len(str(x))) + 0 1 + 0 3 4 + 1 5 5 + """ + warnings.warn( + "DataFrame.applymap has been deprecated. Use DataFrame.map instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self.map(func, na_action=na_action, **kwargs) + + # ---------------------------------------------------------------------- + # Merging / joining methods + + def _append( + self, + other, + ignore_index: bool = False, + verify_integrity: bool = False, + sort: bool = False, + ) -> DataFrame: + if isinstance(other, (Series, dict)): + if isinstance(other, dict): + if not ignore_index: + raise TypeError("Can only append a dict if ignore_index=True") + other = Series(other) + if other.name is None and not ignore_index: + raise TypeError( + "Can only append a Series if ignore_index=True " + "or if the Series has a name" + ) + + index = Index( + [other.name], + name=self.index.names + if isinstance(self.index, MultiIndex) + else self.index.name, + ) + row_df = other.to_frame().T + # infer_objects is needed for + # test_append_empty_frame_to_series_with_dateutil_tz + other = row_df.infer_objects(copy=False).rename_axis( + index.names, copy=False + ) + elif isinstance(other, list): + if not other: + pass + elif not isinstance(other[0], DataFrame): + other = DataFrame(other) + if self.index.name is not None and not ignore_index: + other.index.name = self.index.name + + from pandas.core.reshape.concat import concat + + if isinstance(other, (list, tuple)): + to_concat = [self, *other] + else: + to_concat = [self, other] + + result = concat( + to_concat, + ignore_index=ignore_index, + verify_integrity=verify_integrity, + sort=sort, + ) + return result.__finalize__(self, method="append") + + def join( + self, + other: DataFrame | Series | Iterable[DataFrame | Series], + on: IndexLabel | None = None, + how: MergeHow = "left", + lsuffix: str = "", + rsuffix: str = "", + sort: bool = False, + validate: JoinValidate | None = None, + ) -> DataFrame: + """ + Join columns of another DataFrame. + + Join columns with `other` DataFrame either on index or on a key + column. Efficiently join multiple DataFrame objects by index at once by + passing a list. + + Parameters + ---------- + other : DataFrame, Series, or a list containing any combination of them + Index should be similar to one of the columns in this one. If a + Series is passed, its name attribute must be set, and that will be + used as the column name in the resulting joined DataFrame. + on : str, list of str, or array-like, optional + Column or index level name(s) in the caller to join on the index + in `other`, otherwise joins index-on-index. If multiple + values given, the `other` DataFrame must have a MultiIndex. Can + pass an array as the join key if it is not already contained in + the calling DataFrame. Like an Excel VLOOKUP operation. + how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'left' + How to handle the operation of the two objects. + + * left: use calling frame's index (or column if on is specified) + * right: use `other`'s index. + * outer: form union of calling frame's index (or column if on is + specified) with `other`'s index, and sort it lexicographically. + * inner: form intersection of calling frame's index (or column if + on is specified) with `other`'s index, preserving the order + of the calling's one. + * cross: creates the cartesian product from both frames, preserves the order + of the left keys. + lsuffix : str, default '' + Suffix to use from left frame's overlapping columns. + rsuffix : str, default '' + Suffix to use from right frame's overlapping columns. + sort : bool, default False + Order result DataFrame lexicographically by the join key. If False, + the order of the join key depends on the join type (how keyword). + validate : str, optional + If specified, checks if join is of specified type. + + * "one_to_one" or "1:1": check if join keys are unique in both left + and right datasets. + * "one_to_many" or "1:m": check if join keys are unique in left dataset. + * "many_to_one" or "m:1": check if join keys are unique in right dataset. + * "many_to_many" or "m:m": allowed, but does not result in checks. + + .. versionadded:: 1.5.0 + + Returns + ------- + DataFrame + A dataframe containing columns from both the caller and `other`. + + See Also + -------- + DataFrame.merge : For column(s)-on-column(s) operations. + + Notes + ----- + Parameters `on`, `lsuffix`, and `rsuffix` are not supported when + passing a list of `DataFrame` objects. + + Examples + -------- + >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], + ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) + + >>> df + key A + 0 K0 A0 + 1 K1 A1 + 2 K2 A2 + 3 K3 A3 + 4 K4 A4 + 5 K5 A5 + + >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'], + ... 'B': ['B0', 'B1', 'B2']}) + + >>> other + key B + 0 K0 B0 + 1 K1 B1 + 2 K2 B2 + + Join DataFrames using their indexes. + + >>> df.join(other, lsuffix='_caller', rsuffix='_other') + key_caller A key_other B + 0 K0 A0 K0 B0 + 1 K1 A1 K1 B1 + 2 K2 A2 K2 B2 + 3 K3 A3 NaN NaN + 4 K4 A4 NaN NaN + 5 K5 A5 NaN NaN + + If we want to join using the key columns, we need to set key to be + the index in both `df` and `other`. The joined DataFrame will have + key as its index. + + >>> df.set_index('key').join(other.set_index('key')) + A B + key + K0 A0 B0 + K1 A1 B1 + K2 A2 B2 + K3 A3 NaN + K4 A4 NaN + K5 A5 NaN + + Another option to join using the key columns is to use the `on` + parameter. DataFrame.join always uses `other`'s index but we can use + any column in `df`. This method preserves the original DataFrame's + index in the result. + + >>> df.join(other.set_index('key'), on='key') + key A B + 0 K0 A0 B0 + 1 K1 A1 B1 + 2 K2 A2 B2 + 3 K3 A3 NaN + 4 K4 A4 NaN + 5 K5 A5 NaN + + Using non-unique key values shows how they are matched. + + >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'], + ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) + + >>> df + key A + 0 K0 A0 + 1 K1 A1 + 2 K1 A2 + 3 K3 A3 + 4 K0 A4 + 5 K1 A5 + + >>> df.join(other.set_index('key'), on='key', validate='m:1') + key A B + 0 K0 A0 B0 + 1 K1 A1 B1 + 2 K1 A2 B1 + 3 K3 A3 NaN + 4 K0 A4 B0 + 5 K1 A5 B1 + """ + from pandas.core.reshape.concat import concat + from pandas.core.reshape.merge import merge + + if isinstance(other, Series): + if other.name is None: + raise ValueError("Other Series must have a name") + other = DataFrame({other.name: other}) + + if isinstance(other, DataFrame): + if how == "cross": + return merge( + self, + other, + how=how, + on=on, + suffixes=(lsuffix, rsuffix), + sort=sort, + validate=validate, + ) + return merge( + self, + other, + left_on=on, + how=how, + left_index=on is None, + right_index=True, + suffixes=(lsuffix, rsuffix), + sort=sort, + validate=validate, + ) + else: + if on is not None: + raise ValueError( + "Joining multiple DataFrames only supported for joining on index" + ) + + if rsuffix or lsuffix: + raise ValueError( + "Suffixes not supported when joining multiple DataFrames" + ) + + # Mypy thinks the RHS is a + # "Union[DataFrame, Series, Iterable[Union[DataFrame, Series]]]" whereas + # the LHS is an "Iterable[DataFrame]", but in reality both types are + # "Iterable[Union[DataFrame, Series]]" due to the if statements + frames = [cast("DataFrame | Series", self)] + list(other) + + can_concat = all(df.index.is_unique for df in frames) + + # join indexes only using concat + if can_concat: + if how == "left": + res = concat( + frames, axis=1, join="outer", verify_integrity=True, sort=sort + ) + return res.reindex(self.index, copy=False) + else: + return concat( + frames, axis=1, join=how, verify_integrity=True, sort=sort + ) + + joined = frames[0] + + for frame in frames[1:]: + joined = merge( + joined, + frame, + how=how, + left_index=True, + right_index=True, + validate=validate, + ) + + return joined + + @Substitution("") + @Appender(_merge_doc, indents=2) + def merge( + self, + right: DataFrame | Series, + how: MergeHow = "inner", + on: IndexLabel | AnyArrayLike | None = None, + left_on: IndexLabel | AnyArrayLike | None = None, + right_on: IndexLabel | AnyArrayLike | None = None, + left_index: bool = False, + right_index: bool = False, + sort: bool = False, + suffixes: Suffixes = ("_x", "_y"), + copy: bool | None = None, + indicator: str | bool = False, + validate: MergeValidate | None = None, + ) -> DataFrame: + from pandas.core.reshape.merge import merge + + return merge( + self, + right, + how=how, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + sort=sort, + suffixes=suffixes, + copy=copy, + indicator=indicator, + validate=validate, + ) + + def round( + self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs + ) -> DataFrame: + """ + Round a DataFrame to a variable number of decimal places. + + Parameters + ---------- + decimals : int, dict, Series + Number of decimal places to round each column to. If an int is + given, round each column to the same number of places. + Otherwise dict and Series round to variable numbers of places. + Column names should be in the keys if `decimals` is a + dict-like, or in the index if `decimals` is a Series. Any + columns not included in `decimals` will be left as is. Elements + of `decimals` which are not columns of the input will be + ignored. + *args + Additional keywords have no effect but might be accepted for + compatibility with numpy. + **kwargs + Additional keywords have no effect but might be accepted for + compatibility with numpy. + + Returns + ------- + DataFrame + A DataFrame with the affected columns rounded to the specified + number of decimal places. + + See Also + -------- + numpy.around : Round a numpy array to the given number of decimals. + Series.round : Round a Series to the given number of decimals. + + Examples + -------- + >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)], + ... columns=['dogs', 'cats']) + >>> df + dogs cats + 0 0.21 0.32 + 1 0.01 0.67 + 2 0.66 0.03 + 3 0.21 0.18 + + By providing an integer each column is rounded to the same number + of decimal places + + >>> df.round(1) + dogs cats + 0 0.2 0.3 + 1 0.0 0.7 + 2 0.7 0.0 + 3 0.2 0.2 + + With a dict, the number of places for specific columns can be + specified with the column names as key and the number of decimal + places as value + + >>> df.round({'dogs': 1, 'cats': 0}) + dogs cats + 0 0.2 0.0 + 1 0.0 1.0 + 2 0.7 0.0 + 3 0.2 0.0 + + Using a Series, the number of places for specific columns can be + specified with the column names as index and the number of + decimal places as value + + >>> decimals = pd.Series([0, 1], index=['cats', 'dogs']) + >>> df.round(decimals) + dogs cats + 0 0.2 0.0 + 1 0.0 1.0 + 2 0.7 0.0 + 3 0.2 0.0 + """ + from pandas.core.reshape.concat import concat + + def _dict_round(df: DataFrame, decimals): + for col, vals in df.items(): + try: + yield _series_round(vals, decimals[col]) + except KeyError: + yield vals + + def _series_round(ser: Series, decimals: int) -> Series: + if is_integer_dtype(ser.dtype) or is_float_dtype(ser.dtype): + return ser.round(decimals) + return ser + + nv.validate_round(args, kwargs) + + if isinstance(decimals, (dict, Series)): + if isinstance(decimals, Series) and not decimals.index.is_unique: + raise ValueError("Index of decimals must be unique") + if is_dict_like(decimals) and not all( + is_integer(value) for _, value in decimals.items() + ): + raise TypeError("Values in decimals must be integers") + new_cols = list(_dict_round(self, decimals)) + elif is_integer(decimals): + # Dispatch to Block.round + # Argument "decimals" to "round" of "BaseBlockManager" has incompatible + # type "Union[int, integer[Any]]"; expected "int" + new_mgr = self._mgr.round( + decimals=decimals, # type: ignore[arg-type] + using_cow=using_copy_on_write(), + ) + return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__( + self, method="round" + ) + else: + raise TypeError("decimals must be an integer, a dict-like or a Series") + + if new_cols is not None and len(new_cols) > 0: + return self._constructor( + concat(new_cols, axis=1), index=self.index, columns=self.columns + ).__finalize__(self, method="round") + else: + return self.copy(deep=False) + + # ---------------------------------------------------------------------- + # Statistical methods, etc. + + def corr( + self, + method: CorrelationMethod = "pearson", + min_periods: int = 1, + numeric_only: bool = False, + ) -> DataFrame: + """ + Compute pairwise correlation of columns, excluding NA/null values. + + Parameters + ---------- + method : {'pearson', 'kendall', 'spearman'} or callable + Method of correlation: + + * pearson : standard correlation coefficient + * kendall : Kendall Tau correlation coefficient + * spearman : Spearman rank correlation + * callable: callable with input two 1d ndarrays + and returning a float. Note that the returned matrix from corr + will have 1 along the diagonals and will be symmetric + regardless of the callable's behavior. + min_periods : int, optional + Minimum number of observations required per pair of columns + to have a valid result. Currently only available for Pearson + and Spearman correlation. + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + .. versionchanged:: 2.0.0 + The default value of ``numeric_only`` is now ``False``. + + Returns + ------- + DataFrame + Correlation matrix. + + See Also + -------- + DataFrame.corrwith : Compute pairwise correlation with another + DataFrame or Series. + Series.corr : Compute the correlation between two Series. + + Notes + ----- + Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations. + + * `Pearson correlation coefficient `_ + * `Kendall rank correlation coefficient `_ + * `Spearman's rank correlation coefficient `_ + + Examples + -------- + >>> def histogram_intersection(a, b): + ... v = np.minimum(a, b).sum().round(decimals=1) + ... return v + >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], + ... columns=['dogs', 'cats']) + >>> df.corr(method=histogram_intersection) + dogs cats + dogs 1.0 0.3 + cats 0.3 1.0 + + >>> df = pd.DataFrame([(1, 1), (2, np.nan), (np.nan, 3), (4, 4)], + ... columns=['dogs', 'cats']) + >>> df.corr(min_periods=3) + dogs cats + dogs 1.0 NaN + cats NaN 1.0 + """ # noqa: E501 + data = self._get_numeric_data() if numeric_only else self + cols = data.columns + idx = cols.copy() + mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) + + if method == "pearson": + correl = libalgos.nancorr(mat, minp=min_periods) + elif method == "spearman": + correl = libalgos.nancorr_spearman(mat, minp=min_periods) + elif method == "kendall" or callable(method): + if min_periods is None: + min_periods = 1 + mat = mat.T + corrf = nanops.get_corr_func(method) + K = len(cols) + correl = np.empty((K, K), dtype=float) + mask = np.isfinite(mat) + for i, ac in enumerate(mat): + for j, bc in enumerate(mat): + if i > j: + continue + + valid = mask[i] & mask[j] + if valid.sum() < min_periods: + c = np.nan + elif i == j: + c = 1.0 + elif not valid.all(): + c = corrf(ac[valid], bc[valid]) + else: + c = corrf(ac, bc) + correl[i, j] = c + correl[j, i] = c + else: + raise ValueError( + "method must be either 'pearson', " + "'spearman', 'kendall', or a callable, " + f"'{method}' was supplied" + ) + + result = self._constructor(correl, index=idx, columns=cols, copy=False) + return result.__finalize__(self, method="corr") + + def cov( + self, + min_periods: int | None = None, + ddof: int | None = 1, + numeric_only: bool = False, + ) -> DataFrame: + """ + Compute pairwise covariance of columns, excluding NA/null values. + + Compute the pairwise covariance among the series of a DataFrame. + The returned data frame is the `covariance matrix + `__ of the columns + of the DataFrame. + + Both NA and null values are automatically excluded from the + calculation. (See the note below about bias from missing values.) + A threshold can be set for the minimum number of + observations for each value created. Comparisons with observations + below this threshold will be returned as ``NaN``. + + This method is generally used for the analysis of time series data to + understand the relationship between different measures + across time. + + Parameters + ---------- + min_periods : int, optional + Minimum number of observations required per pair of columns + to have a valid result. + + ddof : int, default 1 + Delta degrees of freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + This argument is applicable only when no ``nan`` is in the dataframe. + + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + .. versionchanged:: 2.0.0 + The default value of ``numeric_only`` is now ``False``. + + Returns + ------- + DataFrame + The covariance matrix of the series of the DataFrame. + + See Also + -------- + Series.cov : Compute covariance with another Series. + core.window.ewm.ExponentialMovingWindow.cov : Exponential weighted sample + covariance. + core.window.expanding.Expanding.cov : Expanding sample covariance. + core.window.rolling.Rolling.cov : Rolling sample covariance. + + Notes + ----- + Returns the covariance matrix of the DataFrame's time series. + The covariance is normalized by N-ddof. + + For DataFrames that have Series that are missing data (assuming that + data is `missing at random + `__) + the returned covariance matrix will be an unbiased estimate + of the variance and covariance between the member Series. + + However, for many applications this estimate may not be acceptable + because the estimate covariance matrix is not guaranteed to be positive + semi-definite. This could lead to estimate correlations having + absolute values which are greater than one, and/or a non-invertible + covariance matrix. See `Estimation of covariance matrices + `__ for more details. + + Examples + -------- + >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)], + ... columns=['dogs', 'cats']) + >>> df.cov() + dogs cats + dogs 0.666667 -1.000000 + cats -1.000000 1.666667 + + >>> np.random.seed(42) + >>> df = pd.DataFrame(np.random.randn(1000, 5), + ... columns=['a', 'b', 'c', 'd', 'e']) + >>> df.cov() + a b c d e + a 0.998438 -0.020161 0.059277 -0.008943 0.014144 + b -0.020161 1.059352 -0.008543 -0.024738 0.009826 + c 0.059277 -0.008543 1.010670 -0.001486 -0.000271 + d -0.008943 -0.024738 -0.001486 0.921297 -0.013692 + e 0.014144 0.009826 -0.000271 -0.013692 0.977795 + + **Minimum number of periods** + + This method also supports an optional ``min_periods`` keyword + that specifies the required minimum number of non-NA observations for + each column pair in order to have a valid result: + + >>> np.random.seed(42) + >>> df = pd.DataFrame(np.random.randn(20, 3), + ... columns=['a', 'b', 'c']) + >>> df.loc[df.index[:5], 'a'] = np.nan + >>> df.loc[df.index[5:10], 'b'] = np.nan + >>> df.cov(min_periods=12) + a b c + a 0.316741 NaN -0.150812 + b NaN 1.248003 0.191417 + c -0.150812 0.191417 0.895202 + """ + data = self._get_numeric_data() if numeric_only else self + cols = data.columns + idx = cols.copy() + mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) + + if notna(mat).all(): + if min_periods is not None and min_periods > len(mat): + base_cov = np.empty((mat.shape[1], mat.shape[1])) + base_cov.fill(np.nan) + else: + base_cov = np.cov(mat.T, ddof=ddof) + base_cov = base_cov.reshape((len(cols), len(cols))) + else: + base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods) + + result = self._constructor(base_cov, index=idx, columns=cols, copy=False) + return result.__finalize__(self, method="cov") + + def corrwith( + self, + other: DataFrame | Series, + axis: Axis = 0, + drop: bool = False, + method: CorrelationMethod = "pearson", + numeric_only: bool = False, + ) -> Series: + """ + Compute pairwise correlation. + + Pairwise correlation is computed between rows or columns of + DataFrame with rows or columns of Series or DataFrame. DataFrames + are first aligned along both axes before computing the + correlations. + + Parameters + ---------- + other : DataFrame, Series + Object with which to compute correlations. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. 0 or 'index' to compute row-wise, 1 or 'columns' for + column-wise. + drop : bool, default False + Drop missing indices from result. + method : {'pearson', 'kendall', 'spearman'} or callable + Method of correlation: + + * pearson : standard correlation coefficient + * kendall : Kendall Tau correlation coefficient + * spearman : Spearman rank correlation + * callable: callable with input two 1d ndarrays + and returning a float. + + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + .. versionchanged:: 2.0.0 + The default value of ``numeric_only`` is now ``False``. + + Returns + ------- + Series + Pairwise correlations. + + See Also + -------- + DataFrame.corr : Compute pairwise correlation of columns. + + Examples + -------- + >>> index = ["a", "b", "c", "d", "e"] + >>> columns = ["one", "two", "three", "four"] + >>> df1 = pd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns) + >>> df2 = pd.DataFrame(np.arange(16).reshape(4, 4), index=index[:4], columns=columns) + >>> df1.corrwith(df2) + one 1.0 + two 1.0 + three 1.0 + four 1.0 + dtype: float64 + + >>> df2.corrwith(df1, axis=1) + a 1.0 + b 1.0 + c 1.0 + d 1.0 + e NaN + dtype: float64 + """ # noqa: E501 + axis = self._get_axis_number(axis) + this = self._get_numeric_data() if numeric_only else self + + if isinstance(other, Series): + return this.apply(lambda x: other.corr(x, method=method), axis=axis) + + if numeric_only: + other = other._get_numeric_data() + left, right = this.align(other, join="inner", copy=False) + + if axis == 1: + left = left.T + right = right.T + + if method == "pearson": + # mask missing values + left = left + right * 0 + right = right + left * 0 + + # demeaned data + ldem = left - left.mean(numeric_only=numeric_only) + rdem = right - right.mean(numeric_only=numeric_only) + + num = (ldem * rdem).sum() + dom = ( + (left.count() - 1) + * left.std(numeric_only=numeric_only) + * right.std(numeric_only=numeric_only) + ) + + correl = num / dom + + elif method in ["kendall", "spearman"] or callable(method): + + def c(x): + return nanops.nancorr(x[0], x[1], method=method) + + correl = self._constructor_sliced( + map(c, zip(left.values.T, right.values.T)), + index=left.columns, + copy=False, + ) + + else: + raise ValueError( + f"Invalid method {method} was passed, " + "valid methods are: 'pearson', 'kendall', " + "'spearman', or callable" + ) + + if not drop: + # Find non-matching labels along the given axis + # and append missing correlations (GH 22375) + raxis: AxisInt = 1 if axis == 0 else 0 + result_index = this._get_axis(raxis).union(other._get_axis(raxis)) + idx_diff = result_index.difference(correl.index) + + if len(idx_diff) > 0: + correl = correl._append( + Series([np.nan] * len(idx_diff), index=idx_diff) + ) + + return correl + + # ---------------------------------------------------------------------- + # ndarray-like stats methods + + def count(self, axis: Axis = 0, numeric_only: bool = False): + """ + Count non-NA cells for each column or row. + + The values `None`, `NaN`, `NaT`, ``pandas.NA`` are considered NA. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + If 0 or 'index' counts are generated for each column. + If 1 or 'columns' counts are generated for each row. + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + Returns + ------- + Series + For each column/row the number of non-NA/null entries. + + See Also + -------- + Series.count: Number of non-NA elements in a Series. + DataFrame.value_counts: Count unique combinations of columns. + DataFrame.shape: Number of DataFrame rows and columns (including NA + elements). + DataFrame.isna: Boolean same-sized DataFrame showing places of NA + elements. + + Examples + -------- + Constructing DataFrame from a dictionary: + + >>> df = pd.DataFrame({"Person": + ... ["John", "Myla", "Lewis", "John", "Myla"], + ... "Age": [24., np.nan, 21., 33, 26], + ... "Single": [False, True, True, True, False]}) + >>> df + Person Age Single + 0 John 24.0 False + 1 Myla NaN True + 2 Lewis 21.0 True + 3 John 33.0 True + 4 Myla 26.0 False + + Notice the uncounted NA values: + + >>> df.count() + Person 5 + Age 4 + Single 5 + dtype: int64 + + Counts for each **row**: + + >>> df.count(axis='columns') + 0 3 + 1 2 + 2 3 + 3 3 + 4 3 + dtype: int64 + """ + axis = self._get_axis_number(axis) + + if numeric_only: + frame = self._get_numeric_data() + else: + frame = self + + # GH #423 + if len(frame._get_axis(axis)) == 0: + result = self._constructor_sliced(0, index=frame._get_agg_axis(axis)) + else: + result = notna(frame).sum(axis=axis) + + return result.astype("int64", copy=False).__finalize__(self, method="count") + + def _reduce( + self, + op, + name: str, + *, + axis: Axis = 0, + skipna: bool = True, + numeric_only: bool = False, + filter_type=None, + **kwds, + ): + assert filter_type is None or filter_type == "bool", filter_type + out_dtype = "bool" if filter_type == "bool" else None + + if axis is not None: + axis = self._get_axis_number(axis) + + def func(values: np.ndarray): + # We only use this in the case that operates on self.values + return op(values, axis=axis, skipna=skipna, **kwds) + + dtype_has_keepdims: dict[ExtensionDtype, bool] = {} + + def blk_func(values, axis: Axis = 1): + if isinstance(values, ExtensionArray): + if not is_1d_only_ea_dtype(values.dtype) and not isinstance( + self._mgr, ArrayManager + ): + return values._reduce(name, axis=1, skipna=skipna, **kwds) + has_keepdims = dtype_has_keepdims.get(values.dtype) + if has_keepdims is None: + sign = signature(values._reduce) + has_keepdims = "keepdims" in sign.parameters + dtype_has_keepdims[values.dtype] = has_keepdims + if has_keepdims: + return values._reduce(name, skipna=skipna, keepdims=True, **kwds) + else: + warnings.warn( + f"{type(values)}._reduce will require a `keepdims` parameter " + "in the future", + FutureWarning, + stacklevel=find_stack_level(), + ) + result = values._reduce(name, skipna=skipna, **kwds) + return np.array([result]) + else: + return op(values, axis=axis, skipna=skipna, **kwds) + + def _get_data() -> DataFrame: + if filter_type is None: + data = self._get_numeric_data() + else: + # GH#25101, GH#24434 + assert filter_type == "bool" + data = self._get_bool_data() + return data + + # Case with EAs see GH#35881 + df = self + if numeric_only: + df = _get_data() + if axis is None: + dtype = find_common_type([arr.dtype for arr in df._mgr.arrays]) + if isinstance(dtype, ExtensionDtype): + df = df.astype(dtype, copy=False) + arr = concat_compat(list(df._iter_column_arrays())) + return arr._reduce(name, skipna=skipna, keepdims=False, **kwds) + return func(df.values) + elif axis == 1: + if len(df.index) == 0: + # Taking a transpose would result in no columns, losing the dtype. + # In the empty case, reducing along axis 0 or 1 gives the same + # result dtype, so reduce with axis=0 and ignore values + result = df._reduce( + op, + name, + axis=0, + skipna=skipna, + numeric_only=False, + filter_type=filter_type, + **kwds, + ).iloc[:0] + result.index = df.index + return result + + # kurtosis excluded since groupby does not implement it + if df.shape[1] and name != "kurt": + dtype = find_common_type([arr.dtype for arr in df._mgr.arrays]) + if isinstance(dtype, ExtensionDtype): + # GH 54341: fastpath for EA-backed axis=1 reductions + # This flattens the frame into a single 1D array while keeping + # track of the row and column indices of the original frame. Once + # flattened, grouping by the row indices and aggregating should + # be equivalent to transposing the original frame and aggregating + # with axis=0. + name = {"argmax": "idxmax", "argmin": "idxmin"}.get(name, name) + df = df.astype(dtype, copy=False) + arr = concat_compat(list(df._iter_column_arrays())) + nrows, ncols = df.shape + row_index = np.tile(np.arange(nrows), ncols) + col_index = np.repeat(np.arange(ncols), nrows) + ser = Series(arr, index=col_index, copy=False) + # GroupBy will raise a warning with SeriesGroupBy as the object, + # likely confusing users + with rewrite_warning( + target_message=( + f"The behavior of SeriesGroupBy.{name} with all-NA values" + ), + target_category=FutureWarning, + new_message=( + f"The behavior of {type(self).__name__}.{name} with all-NA " + "values, or any-NA and skipna=False, is deprecated. In " + "a future version this will raise ValueError" + ), + ): + result = ser.groupby(row_index).agg(name, **kwds) + result.index = df.index + if not skipna and name not in ("any", "all"): + mask = df.isna().to_numpy(dtype=np.bool_).any(axis=1) + other = -1 if name in ("idxmax", "idxmin") else lib.no_default + result = result.mask(mask, other) + return result + + df = df.T + + # After possibly _get_data and transposing, we are now in the + # simple case where we can use BlockManager.reduce + res = df._mgr.reduce(blk_func) + out = df._constructor_from_mgr(res, axes=res.axes).iloc[0] + if out_dtype is not None and out.dtype != "boolean": + out = out.astype(out_dtype) + elif (df._mgr.get_dtypes() == object).any() and name not in ["any", "all"]: + out = out.astype(object) + elif len(self) == 0 and out.dtype == object and name in ("sum", "prod"): + # Even if we are object dtype, follow numpy and return + # float64, see test_apply_funcs_over_empty + out = out.astype(np.float64) + + return out + + def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: + """ + Special case for _reduce to try to avoid a potentially-expensive transpose. + + Apply the reduction block-wise along axis=1 and then reduce the resulting + 1D arrays. + """ + if name == "all": + result = np.ones(len(self), dtype=bool) + ufunc = np.logical_and + elif name == "any": + result = np.zeros(len(self), dtype=bool) + # error: Incompatible types in assignment + # (expression has type "_UFunc_Nin2_Nout1[Literal['logical_or'], + # Literal[20], Literal[False]]", variable has type + # "_UFunc_Nin2_Nout1[Literal['logical_and'], Literal[20], + # Literal[True]]") + ufunc = np.logical_or # type: ignore[assignment] + else: + raise NotImplementedError(name) + + for arr in self._mgr.arrays: + middle = func(arr, axis=0, skipna=skipna) + result = ufunc(result, middle) + + res_ser = self._constructor_sliced(result, index=self.index, copy=False) + return res_ser + + @doc(make_doc("any", ndim=2)) + # error: Signature of "any" incompatible with supertype "NDFrame" + def any( # type: ignore[override] + self, + *, + axis: Axis | None = 0, + bool_only: bool = False, + skipna: bool = True, + **kwargs, + ) -> Series | bool: + result = self._logical_func( + "any", nanops.nanany, axis, bool_only, skipna, **kwargs + ) + if isinstance(result, Series): + result = result.__finalize__(self, method="any") + return result + + @doc(make_doc("all", ndim=2)) + def all( + self, + axis: Axis | None = 0, + bool_only: bool = False, + skipna: bool = True, + **kwargs, + ) -> Series | bool: + result = self._logical_func( + "all", nanops.nanall, axis, bool_only, skipna, **kwargs + ) + if isinstance(result, Series): + result = result.__finalize__(self, method="all") + return result + + @doc(make_doc("min", ndim=2)) + def min( + self, + axis: Axis | None = 0, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ): + result = super().min(axis, skipna, numeric_only, **kwargs) + if isinstance(result, Series): + result = result.__finalize__(self, method="min") + return result + + @doc(make_doc("max", ndim=2)) + def max( + self, + axis: Axis | None = 0, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ): + result = super().max(axis, skipna, numeric_only, **kwargs) + if isinstance(result, Series): + result = result.__finalize__(self, method="max") + return result + + @doc(make_doc("sum", ndim=2)) + def sum( + self, + axis: Axis | None = 0, + skipna: bool = True, + numeric_only: bool = False, + min_count: int = 0, + **kwargs, + ): + result = super().sum(axis, skipna, numeric_only, min_count, **kwargs) + return result.__finalize__(self, method="sum") + + @doc(make_doc("prod", ndim=2)) + def prod( + self, + axis: Axis | None = 0, + skipna: bool = True, + numeric_only: bool = False, + min_count: int = 0, + **kwargs, + ): + result = super().prod(axis, skipna, numeric_only, min_count, **kwargs) + return result.__finalize__(self, method="prod") + + @doc(make_doc("mean", ndim=2)) + def mean( + self, + axis: Axis | None = 0, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ): + result = super().mean(axis, skipna, numeric_only, **kwargs) + if isinstance(result, Series): + result = result.__finalize__(self, method="mean") + return result + + @doc(make_doc("median", ndim=2)) + def median( + self, + axis: Axis | None = 0, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ): + result = super().median(axis, skipna, numeric_only, **kwargs) + if isinstance(result, Series): + result = result.__finalize__(self, method="median") + return result + + @doc(make_doc("sem", ndim=2)) + def sem( + self, + axis: Axis | None = 0, + skipna: bool = True, + ddof: int = 1, + numeric_only: bool = False, + **kwargs, + ): + result = super().sem(axis, skipna, ddof, numeric_only, **kwargs) + if isinstance(result, Series): + result = result.__finalize__(self, method="sem") + return result + + @doc(make_doc("var", ndim=2)) + def var( + self, + axis: Axis | None = 0, + skipna: bool = True, + ddof: int = 1, + numeric_only: bool = False, + **kwargs, + ): + result = super().var(axis, skipna, ddof, numeric_only, **kwargs) + if isinstance(result, Series): + result = result.__finalize__(self, method="var") + return result + + @doc(make_doc("std", ndim=2)) + def std( + self, + axis: Axis | None = 0, + skipna: bool = True, + ddof: int = 1, + numeric_only: bool = False, + **kwargs, + ): + result = super().std(axis, skipna, ddof, numeric_only, **kwargs) + if isinstance(result, Series): + result = result.__finalize__(self, method="std") + return result + + @doc(make_doc("skew", ndim=2)) + def skew( + self, + axis: Axis | None = 0, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ): + result = super().skew(axis, skipna, numeric_only, **kwargs) + if isinstance(result, Series): + result = result.__finalize__(self, method="skew") + return result + + @doc(make_doc("kurt", ndim=2)) + def kurt( + self, + axis: Axis | None = 0, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ): + result = super().kurt(axis, skipna, numeric_only, **kwargs) + if isinstance(result, Series): + result = result.__finalize__(self, method="kurt") + return result + + kurtosis = kurt + product = prod + + @doc(make_doc("cummin", ndim=2)) + def cummin(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): + return NDFrame.cummin(self, axis, skipna, *args, **kwargs) + + @doc(make_doc("cummax", ndim=2)) + def cummax(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): + return NDFrame.cummax(self, axis, skipna, *args, **kwargs) + + @doc(make_doc("cumsum", ndim=2)) + def cumsum(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): + return NDFrame.cumsum(self, axis, skipna, *args, **kwargs) + + @doc(make_doc("cumprod", 2)) + def cumprod(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): + return NDFrame.cumprod(self, axis, skipna, *args, **kwargs) + + def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: + """ + Count number of distinct elements in specified axis. + + Return Series with number of distinct elements. Can ignore NaN + values. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for + column-wise. + dropna : bool, default True + Don't include NaN in the counts. + + Returns + ------- + Series + + See Also + -------- + Series.nunique: Method nunique for Series. + DataFrame.count: Count non-NA cells for each column or row. + + Examples + -------- + >>> df = pd.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]}) + >>> df.nunique() + A 3 + B 2 + dtype: int64 + + >>> df.nunique(axis=1) + 0 1 + 1 2 + 2 2 + dtype: int64 + """ + return self.apply(Series.nunique, axis=axis, dropna=dropna) + + @doc(_shared_docs["idxmin"], numeric_only_default="False") + def idxmin( + self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False + ) -> Series: + axis = self._get_axis_number(axis) + + if self.empty and len(self.axes[axis]): + axis_dtype = self.axes[axis].dtype + return self._constructor_sliced(dtype=axis_dtype) + + if numeric_only: + data = self._get_numeric_data() + else: + data = self + + res = data._reduce( + nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False + ) + indices = res._values + # indices will always be np.ndarray since axis is not N + + if (indices == -1).any(): + warnings.warn( + f"The behavior of {type(self).__name__}.idxmin with all-NA " + "values, or any-NA and skipna=False, is deprecated. In a future " + "version this will raise ValueError", + FutureWarning, + stacklevel=find_stack_level(), + ) + + index = data._get_axis(axis) + result = algorithms.take( + index._values, indices, allow_fill=True, fill_value=index._na_value + ) + final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis)) + return final_result.__finalize__(self, method="idxmin") + + @doc(_shared_docs["idxmax"], numeric_only_default="False") + def idxmax( + self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False + ) -> Series: + axis = self._get_axis_number(axis) + + if self.empty and len(self.axes[axis]): + axis_dtype = self.axes[axis].dtype + return self._constructor_sliced(dtype=axis_dtype) + + if numeric_only: + data = self._get_numeric_data() + else: + data = self + + res = data._reduce( + nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False + ) + indices = res._values + # indices will always be 1d array since axis is not None + + if (indices == -1).any(): + warnings.warn( + f"The behavior of {type(self).__name__}.idxmax with all-NA " + "values, or any-NA and skipna=False, is deprecated. In a future " + "version this will raise ValueError", + FutureWarning, + stacklevel=find_stack_level(), + ) + + index = data._get_axis(axis) + result = algorithms.take( + index._values, indices, allow_fill=True, fill_value=index._na_value + ) + final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis)) + return final_result.__finalize__(self, method="idxmax") + + def _get_agg_axis(self, axis_num: int) -> Index: + """ + Let's be explicit about this. + """ + if axis_num == 0: + return self.columns + elif axis_num == 1: + return self.index + else: + raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})") + + def mode( + self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True + ) -> DataFrame: + """ + Get the mode(s) of each element along the selected axis. + + The mode of a set of values is the value that appears most often. + It can be multiple values. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to iterate over while searching for the mode: + + * 0 or 'index' : get mode of each column + * 1 or 'columns' : get mode of each row. + + numeric_only : bool, default False + If True, only apply to numeric columns. + dropna : bool, default True + Don't consider counts of NaN/NaT. + + Returns + ------- + DataFrame + The modes of each column or row. + + See Also + -------- + Series.mode : Return the highest frequency value in a Series. + Series.value_counts : Return the counts of values in a Series. + + Examples + -------- + >>> df = pd.DataFrame([('bird', 2, 2), + ... ('mammal', 4, np.nan), + ... ('arthropod', 8, 0), + ... ('bird', 2, np.nan)], + ... index=('falcon', 'horse', 'spider', 'ostrich'), + ... columns=('species', 'legs', 'wings')) + >>> df + species legs wings + falcon bird 2 2.0 + horse mammal 4 NaN + spider arthropod 8 0.0 + ostrich bird 2 NaN + + By default, missing values are not considered, and the mode of wings + are both 0 and 2. Because the resulting DataFrame has two rows, + the second row of ``species`` and ``legs`` contains ``NaN``. + + >>> df.mode() + species legs wings + 0 bird 2.0 0.0 + 1 NaN NaN 2.0 + + Setting ``dropna=False`` ``NaN`` values are considered and they can be + the mode (like for wings). + + >>> df.mode(dropna=False) + species legs wings + 0 bird 2 NaN + + Setting ``numeric_only=True``, only the mode of numeric columns is + computed, and columns of other types are ignored. + + >>> df.mode(numeric_only=True) + legs wings + 0 2.0 0.0 + 1 NaN 2.0 + + To compute the mode over columns and not rows, use the axis parameter: + + >>> df.mode(axis='columns', numeric_only=True) + 0 1 + falcon 2.0 NaN + horse 4.0 NaN + spider 0.0 8.0 + ostrich 2.0 NaN + """ + data = self if not numeric_only else self._get_numeric_data() + + def f(s): + return s.mode(dropna=dropna) + + data = data.apply(f, axis=axis) + # Ensure index is type stable (should always use int index) + if data.empty: + data.index = default_index(0) + + return data + + @overload + def quantile( + self, + q: float = ..., + axis: Axis = ..., + numeric_only: bool = ..., + interpolation: QuantileInterpolation = ..., + method: Literal["single", "table"] = ..., + ) -> Series: + ... + + @overload + def quantile( + self, + q: AnyArrayLike | Sequence[float], + axis: Axis = ..., + numeric_only: bool = ..., + interpolation: QuantileInterpolation = ..., + method: Literal["single", "table"] = ..., + ) -> Series | DataFrame: + ... + + @overload + def quantile( + self, + q: float | AnyArrayLike | Sequence[float] = ..., + axis: Axis = ..., + numeric_only: bool = ..., + interpolation: QuantileInterpolation = ..., + method: Literal["single", "table"] = ..., + ) -> Series | DataFrame: + ... + + def quantile( + self, + q: float | AnyArrayLike | Sequence[float] = 0.5, + axis: Axis = 0, + numeric_only: bool = False, + interpolation: QuantileInterpolation = "linear", + method: Literal["single", "table"] = "single", + ) -> Series | DataFrame: + """ + Return values at the given quantile over requested axis. + + Parameters + ---------- + q : float or array-like, default 0.5 (50% quantile) + Value between 0 <= q <= 1, the quantile(s) to compute. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionchanged:: 2.0.0 + The default value of ``numeric_only`` is now ``False``. + + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + This optional parameter specifies the interpolation method to use, + when the desired quantile lies between two data points `i` and `j`: + + * linear: `i + (j - i) * fraction`, where `fraction` is the + fractional part of the index surrounded by `i` and `j`. + * lower: `i`. + * higher: `j`. + * nearest: `i` or `j` whichever is nearest. + * midpoint: (`i` + `j`) / 2. + method : {'single', 'table'}, default 'single' + Whether to compute quantiles per-column ('single') or over all columns + ('table'). When 'table', the only allowed interpolation methods are + 'nearest', 'lower', and 'higher'. + + Returns + ------- + Series or DataFrame + + If ``q`` is an array, a DataFrame will be returned where the + index is ``q``, the columns are the columns of self, and the + values are the quantiles. + If ``q`` is a float, a Series will be returned where the + index is the columns of self and the values are the quantiles. + + See Also + -------- + core.window.rolling.Rolling.quantile: Rolling quantile. + numpy.percentile: Numpy function to compute the percentile. + + Examples + -------- + >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), + ... columns=['a', 'b']) + >>> df.quantile(.1) + a 1.3 + b 3.7 + Name: 0.1, dtype: float64 + >>> df.quantile([.1, .5]) + a b + 0.1 1.3 3.7 + 0.5 2.5 55.0 + + Specifying `method='table'` will compute the quantile over all columns. + + >>> df.quantile(.1, method="table", interpolation="nearest") + a 1 + b 1 + Name: 0.1, dtype: int64 + >>> df.quantile([.1, .5], method="table", interpolation="nearest") + a b + 0.1 1 1 + 0.5 3 100 + + Specifying `numeric_only=False` will also compute the quantile of + datetime and timedelta data. + + >>> df = pd.DataFrame({'A': [1, 2], + ... 'B': [pd.Timestamp('2010'), + ... pd.Timestamp('2011')], + ... 'C': [pd.Timedelta('1 days'), + ... pd.Timedelta('2 days')]}) + >>> df.quantile(0.5, numeric_only=False) + A 1.5 + B 2010-07-02 12:00:00 + C 1 days 12:00:00 + Name: 0.5, dtype: object + """ + validate_percentile(q) + axis = self._get_axis_number(axis) + + if not is_list_like(q): + # BlockManager.quantile expects listlike, so we wrap and unwrap here + # error: List item 0 has incompatible type "float | ExtensionArray | + # ndarray[Any, Any] | Index | Series | Sequence[float]"; expected "float" + res_df = self.quantile( + [q], # type: ignore[list-item] + axis=axis, + numeric_only=numeric_only, + interpolation=interpolation, + method=method, + ) + if method == "single": + res = res_df.iloc[0] + else: + # cannot directly iloc over sparse arrays + res = res_df.T.iloc[:, 0] + if axis == 1 and len(self) == 0: + # GH#41544 try to get an appropriate dtype + dtype = find_common_type(list(self.dtypes)) + if needs_i8_conversion(dtype): + return res.astype(dtype) + return res + + q = Index(q, dtype=np.float64) + data = self._get_numeric_data() if numeric_only else self + + if axis == 1: + data = data.T + + if len(data.columns) == 0: + # GH#23925 _get_numeric_data may have dropped all columns + cols = Index([], name=self.columns.name) + + dtype = np.float64 + if axis == 1: + # GH#41544 try to get an appropriate dtype + cdtype = find_common_type(list(self.dtypes)) + if needs_i8_conversion(cdtype): + dtype = cdtype + + res = self._constructor([], index=q, columns=cols, dtype=dtype) + return res.__finalize__(self, method="quantile") + + valid_method = {"single", "table"} + if method not in valid_method: + raise ValueError( + f"Invalid method: {method}. Method must be in {valid_method}." + ) + if method == "single": + res = data._mgr.quantile(qs=q, interpolation=interpolation) + elif method == "table": + valid_interpolation = {"nearest", "lower", "higher"} + if interpolation not in valid_interpolation: + raise ValueError( + f"Invalid interpolation: {interpolation}. " + f"Interpolation must be in {valid_interpolation}" + ) + # handle degenerate case + if len(data) == 0: + if data.ndim == 2: + dtype = find_common_type(list(self.dtypes)) + else: + dtype = self.dtype + return self._constructor([], index=q, columns=data.columns, dtype=dtype) + + q_idx = np.quantile(np.arange(len(data)), q, method=interpolation) + + by = data.columns + if len(by) > 1: + keys = [data._get_label_or_level_values(x) for x in by] + indexer = lexsort_indexer(keys) + else: + k = data._get_label_or_level_values(by[0]) + indexer = nargsort(k) + + res = data._mgr.take(indexer[q_idx], verify=False) + res.axes[1] = q + + result = self._constructor_from_mgr(res, axes=res.axes) + return result.__finalize__(self, method="quantile") + + def to_timestamp( + self, + freq: Frequency | None = None, + how: ToTimestampHow = "start", + axis: Axis = 0, + copy: bool | None = None, + ) -> DataFrame: + """ + Cast to DatetimeIndex of timestamps, at *beginning* of period. + + Parameters + ---------- + freq : str, default frequency of PeriodIndex + Desired frequency. + how : {'s', 'e', 'start', 'end'} + Convention for converting period to timestamp; start of period + vs. end. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to convert (the index by default). + copy : bool, default True + If False then underlying input data is not copied. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + + Returns + ------- + DataFrame + The DataFrame has a DatetimeIndex. + + Examples + -------- + >>> idx = pd.PeriodIndex(['2023', '2024'], freq='Y') + >>> d = {'col1': [1, 2], 'col2': [3, 4]} + >>> df1 = pd.DataFrame(data=d, index=idx) + >>> df1 + col1 col2 + 2023 1 3 + 2024 2 4 + + The resulting timestamps will be at the beginning of the year in this case + + >>> df1 = df1.to_timestamp() + >>> df1 + col1 col2 + 2023-01-01 1 3 + 2024-01-01 2 4 + >>> df1.index + DatetimeIndex(['2023-01-01', '2024-01-01'], dtype='datetime64[ns]', freq=None) + + Using `freq` which is the offset that the Timestamps will have + + >>> df2 = pd.DataFrame(data=d, index=idx) + >>> df2 = df2.to_timestamp(freq='M') + >>> df2 + col1 col2 + 2023-01-31 1 3 + 2024-01-31 2 4 + >>> df2.index + DatetimeIndex(['2023-01-31', '2024-01-31'], dtype='datetime64[ns]', freq=None) + """ + new_obj = self.copy(deep=copy and not using_copy_on_write()) + + axis_name = self._get_axis_name(axis) + old_ax = getattr(self, axis_name) + if not isinstance(old_ax, PeriodIndex): + raise TypeError(f"unsupported Type {type(old_ax).__name__}") + + new_ax = old_ax.to_timestamp(freq=freq, how=how) + + setattr(new_obj, axis_name, new_ax) + return new_obj + + def to_period( + self, freq: Frequency | None = None, axis: Axis = 0, copy: bool | None = None + ) -> DataFrame: + """ + Convert DataFrame from DatetimeIndex to PeriodIndex. + + Convert DataFrame from DatetimeIndex to PeriodIndex with desired + frequency (inferred from index if not passed). + + Parameters + ---------- + freq : str, default + Frequency of the PeriodIndex. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to convert (the index by default). + copy : bool, default True + If False then underlying input data is not copied. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + + Returns + ------- + DataFrame + The DataFrame has a PeriodIndex. + + Examples + -------- + >>> idx = pd.to_datetime( + ... [ + ... "2001-03-31 00:00:00", + ... "2002-05-31 00:00:00", + ... "2003-08-31 00:00:00", + ... ] + ... ) + + >>> idx + DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'], + dtype='datetime64[ns]', freq=None) + + >>> idx.to_period("M") + PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]') + + For the yearly frequency + + >>> idx.to_period("Y") + PeriodIndex(['2001', '2002', '2003'], dtype='period[Y-DEC]') + """ + new_obj = self.copy(deep=copy and not using_copy_on_write()) + + axis_name = self._get_axis_name(axis) + old_ax = getattr(self, axis_name) + if not isinstance(old_ax, DatetimeIndex): + raise TypeError(f"unsupported Type {type(old_ax).__name__}") + + new_ax = old_ax.to_period(freq=freq) + + setattr(new_obj, axis_name, new_ax) + return new_obj + + def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame: + """ + Whether each element in the DataFrame is contained in values. + + Parameters + ---------- + values : iterable, Series, DataFrame or dict + The result will only be true at a location if all the + labels match. If `values` is a Series, that's the index. If + `values` is a dict, the keys must be the column names, + which must match. If `values` is a DataFrame, + then both the index and column labels must match. + + Returns + ------- + DataFrame + DataFrame of booleans showing whether each element in the DataFrame + is contained in values. + + See Also + -------- + DataFrame.eq: Equality test for DataFrame. + Series.isin: Equivalent method on Series. + Series.str.contains: Test if pattern or regex is contained within a + string of a Series or Index. + + Examples + -------- + >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, + ... index=['falcon', 'dog']) + >>> df + num_legs num_wings + falcon 2 2 + dog 4 0 + + When ``values`` is a list check whether every value in the DataFrame + is present in the list (which animals have 0 or 2 legs or wings) + + >>> df.isin([0, 2]) + num_legs num_wings + falcon True True + dog False True + + To check if ``values`` is *not* in the DataFrame, use the ``~`` operator: + + >>> ~df.isin([0, 2]) + num_legs num_wings + falcon False False + dog True False + + When ``values`` is a dict, we can pass values to check for each + column separately: + + >>> df.isin({'num_wings': [0, 3]}) + num_legs num_wings + falcon False False + dog False True + + When ``values`` is a Series or DataFrame the index and column must + match. Note that 'falcon' does not match based on the number of legs + in other. + + >>> other = pd.DataFrame({'num_legs': [8, 3], 'num_wings': [0, 2]}, + ... index=['spider', 'falcon']) + >>> df.isin(other) + num_legs num_wings + falcon False True + dog False False + """ + if isinstance(values, dict): + from pandas.core.reshape.concat import concat + + values = collections.defaultdict(list, values) + result = concat( + ( + self.iloc[:, [i]].isin(values[col]) + for i, col in enumerate(self.columns) + ), + axis=1, + ) + elif isinstance(values, Series): + if not values.index.is_unique: + raise ValueError("cannot compute isin with a duplicate axis.") + result = self.eq(values.reindex_like(self), axis="index") + elif isinstance(values, DataFrame): + if not (values.columns.is_unique and values.index.is_unique): + raise ValueError("cannot compute isin with a duplicate axis.") + result = self.eq(values.reindex_like(self)) + else: + if not is_list_like(values): + raise TypeError( + "only list-like or dict-like objects are allowed " + "to be passed to DataFrame.isin(), " + f"you passed a '{type(values).__name__}'" + ) + + def isin_(x): + # error: Argument 2 to "isin" has incompatible type "Union[Series, + # DataFrame, Sequence[Any], Mapping[Any, Any]]"; expected + # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]], Index, + # Series], List[Any], range]" + result = algorithms.isin( + x.ravel(), + values, # type: ignore[arg-type] + ) + return result.reshape(x.shape) + + res_mgr = self._mgr.apply(isin_) + result = self._constructor_from_mgr( + res_mgr, + axes=res_mgr.axes, + ) + return result.__finalize__(self, method="isin") + + # ---------------------------------------------------------------------- + # Add index and columns + _AXIS_ORDERS: list[Literal["index", "columns"]] = ["index", "columns"] + _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = { + **NDFrame._AXIS_TO_AXIS_NUMBER, + 1: 1, + "columns": 1, + } + _AXIS_LEN = len(_AXIS_ORDERS) + _info_axis_number: Literal[1] = 1 + _info_axis_name: Literal["columns"] = "columns" + + index = properties.AxisProperty( + axis=1, + doc=""" + The index (row labels) of the DataFrame. + + The index of a DataFrame is a series of labels that identify each row. + The labels can be integers, strings, or any other hashable type. The index + is used for label-based access and alignment, and can be accessed or + modified using this attribute. + + Returns + ------- + pandas.Index + The index labels of the DataFrame. + + See Also + -------- + DataFrame.columns : The column labels of the DataFrame. + DataFrame.to_numpy : Convert the DataFrame to a NumPy array. + + Examples + -------- + >>> df = pd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'], + ... 'Age': [25, 30, 35], + ... 'Location': ['Seattle', 'New York', 'Kona']}, + ... index=([10, 20, 30])) + >>> df.index + Index([10, 20, 30], dtype='int64') + + In this example, we create a DataFrame with 3 rows and 3 columns, + including Name, Age, and Location information. We set the index labels to + be the integers 10, 20, and 30. We then access the `index` attribute of the + DataFrame, which returns an `Index` object containing the index labels. + + >>> df.index = [100, 200, 300] + >>> df + Name Age Location + 100 Alice 25 Seattle + 200 Bob 30 New York + 300 Aritra 35 Kona + + In this example, we modify the index labels of the DataFrame by assigning + a new list of labels to the `index` attribute. The DataFrame is then + updated with the new labels, and the output shows the modified DataFrame. + """, + ) + columns = properties.AxisProperty( + axis=0, + doc=dedent( + """ + The column labels of the DataFrame. + + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + >>> df + A B + 0 1 3 + 1 2 4 + >>> df.columns + Index(['A', 'B'], dtype='object') + """ + ), + ) + + # ---------------------------------------------------------------------- + # Add plotting methods to DataFrame + plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) + hist = pandas.plotting.hist_frame + boxplot = pandas.plotting.boxplot_frame + sparse = CachedAccessor("sparse", SparseFrameAccessor) + + # ---------------------------------------------------------------------- + # Internal Interface Methods + + def _to_dict_of_blocks(self): + """ + Return a dict of dtype -> Constructor Types that + each is a homogeneous dtype. + + Internal ONLY - only works for BlockManager + """ + mgr = self._mgr + # convert to BlockManager if needed -> this way support ArrayManager as well + mgr = cast(BlockManager, mgr_to_mgr(mgr, "block")) + return { + k: self._constructor_from_mgr(v, axes=v.axes).__finalize__(self) + for k, v, in mgr.to_dict().items() + } + + @property + def values(self) -> np.ndarray: + """ + Return a Numpy representation of the DataFrame. + + .. warning:: + + We recommend using :meth:`DataFrame.to_numpy` instead. + + Only the values in the DataFrame will be returned, the axes labels + will be removed. + + Returns + ------- + numpy.ndarray + The values of the DataFrame. + + See Also + -------- + DataFrame.to_numpy : Recommended alternative to this method. + DataFrame.index : Retrieve the index labels. + DataFrame.columns : Retrieving the column names. + + Notes + ----- + The dtype will be a lower-common-denominator dtype (implicit + upcasting); that is to say if the dtypes (even of numeric types) + are mixed, the one that accommodates all will be chosen. Use this + with care if you are not dealing with the blocks. + + e.g. If the dtypes are float16 and float32, dtype will be upcast to + float32. If dtypes are int32 and uint8, dtype will be upcast to + int32. By :func:`numpy.find_common_type` convention, mixing int64 + and uint64 will result in a float64 dtype. + + Examples + -------- + A DataFrame where all columns are the same type (e.g., int64) results + in an array of the same type. + + >>> df = pd.DataFrame({'age': [ 3, 29], + ... 'height': [94, 170], + ... 'weight': [31, 115]}) + >>> df + age height weight + 0 3 94 31 + 1 29 170 115 + >>> df.dtypes + age int64 + height int64 + weight int64 + dtype: object + >>> df.values + array([[ 3, 94, 31], + [ 29, 170, 115]]) + + A DataFrame with mixed type columns(e.g., str/object, int64, float32) + results in an ndarray of the broadest type that accommodates these + mixed types (e.g., object). + + >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'), + ... ('lion', 80.5, 1), + ... ('monkey', np.nan, None)], + ... columns=('name', 'max_speed', 'rank')) + >>> df2.dtypes + name object + max_speed float64 + rank object + dtype: object + >>> df2.values + array([['parrot', 24.0, 'second'], + ['lion', 80.5, 1], + ['monkey', nan, None]], dtype=object) + """ + return self._mgr.as_array() + + +def _from_nested_dict(data) -> collections.defaultdict: + new_data: collections.defaultdict = collections.defaultdict(dict) + for index, s in data.items(): + for col, v in s.items(): + new_data[col][index] = v + return new_data + + +def _reindex_for_setitem( + value: DataFrame | Series, index: Index +) -> tuple[ArrayLike, BlockValuesRefs | None]: + # reindex if necessary + + if value.index.equals(index) or not len(index): + if using_copy_on_write() and isinstance(value, Series): + return value._values, value._references + return value._values.copy(), None + + # GH#4107 + try: + reindexed_value = value.reindex(index)._values + except ValueError as err: + # raised in MultiIndex.from_tuples, see test_insert_error_msmgs + if not value.index.is_unique: + # duplicate axis + raise err + + raise TypeError( + "incompatible index of inserted column with frame index" + ) from err + return reindexed_value, None diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/generic.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/generic.py new file mode 100644 index 0000000000000000000000000000000000000000..796357355fef43528d87fe8a7fa2820f3f8fc17a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/generic.py @@ -0,0 +1,13960 @@ +# pyright: reportPropertyTypeMismatch=false +from __future__ import annotations + +import collections +from copy import deepcopy +import datetime as dt +from functools import partial +import gc +from json import loads +import operator +import pickle +import re +import sys +from typing import ( + TYPE_CHECKING, + Any, + Callable, + ClassVar, + Literal, + NoReturn, + cast, + final, + overload, +) +import warnings +import weakref + +import numpy as np + +from pandas._config import ( + config, + using_copy_on_write, + warn_copy_on_write, +) + +from pandas._libs import lib +from pandas._libs.lib import is_range_indexer +from pandas._libs.tslibs import ( + Period, + Tick, + Timestamp, + to_offset, +) +from pandas._libs.tslibs.dtypes import freq_to_period_freqstr +from pandas._typing import ( + AlignJoin, + AnyArrayLike, + ArrayLike, + Axes, + Axis, + AxisInt, + CompressionOptions, + DtypeArg, + DtypeBackend, + DtypeObj, + FilePath, + FillnaOptions, + FloatFormatType, + FormattersType, + Frequency, + IgnoreRaise, + IndexKeyFunc, + IndexLabel, + InterpolateOptions, + IntervalClosedType, + JSONSerializable, + Level, + Manager, + NaPosition, + NDFrameT, + OpenFileErrors, + RandomState, + ReindexMethod, + Renamer, + Scalar, + Self, + SequenceNotStr, + SortKind, + StorageOptions, + Suffixes, + T, + TimeAmbiguous, + TimedeltaConvertibleTypes, + TimeNonexistent, + TimestampConvertibleTypes, + TimeUnit, + ValueKeyFunc, + WriteBuffer, + WriteExcelBuffer, + npt, +) +from pandas.compat import PYPY +from pandas.compat._constants import REF_COUNT +from pandas.compat._optional import import_optional_dependency +from pandas.compat.numpy import function as nv +from pandas.errors import ( + AbstractMethodError, + ChainedAssignmentError, + InvalidIndexError, + SettingWithCopyError, + SettingWithCopyWarning, + _chained_assignment_method_msg, + _chained_assignment_warning_method_msg, + _check_cacher, +) +from pandas.util._decorators import ( + deprecate_nonkeyword_arguments, + doc, +) +from pandas.util._exceptions import find_stack_level +from pandas.util._validators import ( + check_dtype_backend, + validate_ascending, + validate_bool_kwarg, + validate_fillna_kwargs, + validate_inclusive, +) + +from pandas.core.dtypes.astype import astype_is_view +from pandas.core.dtypes.common import ( + ensure_object, + ensure_platform_int, + ensure_str, + is_bool, + is_bool_dtype, + is_dict_like, + is_extension_array_dtype, + is_list_like, + is_number, + is_numeric_dtype, + is_re_compilable, + is_scalar, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, + ExtensionDtype, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, +) +from pandas.core.dtypes.inference import ( + is_hashable, + is_nested_list_like, +) +from pandas.core.dtypes.missing import ( + isna, + notna, +) + +from pandas.core import ( + algorithms as algos, + arraylike, + common, + indexing, + missing, + nanops, + sample, +) +from pandas.core.array_algos.replace import should_use_regex +from pandas.core.arrays import ExtensionArray +from pandas.core.base import PandasObject +from pandas.core.construction import extract_array +from pandas.core.flags import Flags +from pandas.core.indexes.api import ( + DatetimeIndex, + Index, + MultiIndex, + PeriodIndex, + RangeIndex, + default_index, + ensure_index, +) +from pandas.core.internals import ( + ArrayManager, + BlockManager, + SingleArrayManager, +) +from pandas.core.internals.construction import ( + mgr_to_mgr, + ndarray_to_mgr, +) +from pandas.core.methods.describe import describe_ndframe +from pandas.core.missing import ( + clean_fill_method, + clean_reindex_fill_method, + find_valid_index, +) +from pandas.core.reshape.concat import concat +from pandas.core.shared_docs import _shared_docs +from pandas.core.sorting import get_indexer_indexer +from pandas.core.window import ( + Expanding, + ExponentialMovingWindow, + Rolling, + Window, +) + +from pandas.io.formats.format import ( + DataFrameFormatter, + DataFrameRenderer, +) +from pandas.io.formats.printing import pprint_thing + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Iterator, + Mapping, + Sequence, + ) + + from pandas._libs.tslibs import BaseOffset + + from pandas import ( + DataFrame, + ExcelWriter, + HDFStore, + Series, + ) + from pandas.core.indexers.objects import BaseIndexer + from pandas.core.resample import Resampler + +# goal is to be able to define the docs close to function, while still being +# able to share +_shared_docs = {**_shared_docs} +_shared_doc_kwargs = { + "axes": "keywords for axes", + "klass": "Series/DataFrame", + "axes_single_arg": "{0 or 'index'} for Series, {0 or 'index', 1 or 'columns'} for DataFrame", # noqa: E501 + "inplace": """ + inplace : bool, default False + If True, performs operation inplace and returns None.""", + "optional_by": """ + by : str or list of str + Name or list of names to sort by""", +} + + +bool_t = bool # Need alias because NDFrame has def bool: + + +class NDFrame(PandasObject, indexing.IndexingMixin): + """ + N-dimensional analogue of DataFrame. Store multi-dimensional in a + size-mutable, labeled data structure + + Parameters + ---------- + data : BlockManager + axes : list + copy : bool, default False + """ + + _internal_names: list[str] = [ + "_mgr", + "_cacher", + "_item_cache", + "_cache", + "_is_copy", + "_name", + "_metadata", + "_flags", + ] + _internal_names_set: set[str] = set(_internal_names) + _accessors: set[str] = set() + _hidden_attrs: frozenset[str] = frozenset([]) + _metadata: list[str] = [] + _is_copy: weakref.ReferenceType[NDFrame] | str | None = None + _mgr: Manager + _attrs: dict[Hashable, Any] + _typ: str + + # ---------------------------------------------------------------------- + # Constructors + + def __init__(self, data: Manager) -> None: + object.__setattr__(self, "_is_copy", None) + object.__setattr__(self, "_mgr", data) + object.__setattr__(self, "_item_cache", {}) + object.__setattr__(self, "_attrs", {}) + object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True)) + + @final + @classmethod + def _init_mgr( + cls, + mgr: Manager, + axes: dict[Literal["index", "columns"], Axes | None], + dtype: DtypeObj | None = None, + copy: bool_t = False, + ) -> Manager: + """passed a manager and a axes dict""" + for a, axe in axes.items(): + if axe is not None: + axe = ensure_index(axe) + bm_axis = cls._get_block_manager_axis(a) + mgr = mgr.reindex_axis(axe, axis=bm_axis) + + # make a copy if explicitly requested + if copy: + mgr = mgr.copy() + if dtype is not None: + # avoid further copies if we can + if ( + isinstance(mgr, BlockManager) + and len(mgr.blocks) == 1 + and mgr.blocks[0].values.dtype == dtype + ): + pass + else: + mgr = mgr.astype(dtype=dtype) + return mgr + + @final + def _as_manager(self, typ: str, copy: bool_t = True) -> Self: + """ + Private helper function to create a DataFrame with specific manager. + + Parameters + ---------- + typ : {"block", "array"} + copy : bool, default True + Only controls whether the conversion from Block->ArrayManager + copies the 1D arrays (to ensure proper/contiguous memory layout). + + Returns + ------- + DataFrame + New DataFrame using specified manager type. Is not guaranteed + to be a copy or not. + """ + new_mgr: Manager + new_mgr = mgr_to_mgr(self._mgr, typ=typ, copy=copy) + # fastpath of passing a manager doesn't check the option/manager class + return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self) + + @final + @classmethod + def _from_mgr(cls, mgr: Manager, axes: list[Index]) -> Self: + """ + Construct a new object of this type from a Manager object and axes. + + Parameters + ---------- + mgr : Manager + Must have the same ndim as cls. + axes : list[Index] + + Notes + ----- + The axes must match mgr.axes, but are required for future-proofing + in the event that axes are refactored out of the Manager objects. + """ + obj = cls.__new__(cls) + NDFrame.__init__(obj, mgr) + return obj + + # ---------------------------------------------------------------------- + # attrs and flags + + @property + def attrs(self) -> dict[Hashable, Any]: + """ + Dictionary of global attributes of this dataset. + + .. warning:: + + attrs is experimental and may change without warning. + + See Also + -------- + DataFrame.flags : Global flags applying to this object. + + Notes + ----- + Many operations that create new datasets will copy ``attrs``. Copies + are always deep so that changing ``attrs`` will only affect the + present dataset. ``pandas.concat`` copies ``attrs`` only if all input + datasets have the same ``attrs``. + + Examples + -------- + For Series: + + >>> ser = pd.Series([1, 2, 3]) + >>> ser.attrs = {"A": [10, 20, 30]} + >>> ser.attrs + {'A': [10, 20, 30]} + + For DataFrame: + + >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + >>> df.attrs = {"A": [10, 20, 30]} + >>> df.attrs + {'A': [10, 20, 30]} + """ + return self._attrs + + @attrs.setter + def attrs(self, value: Mapping[Hashable, Any]) -> None: + self._attrs = dict(value) + + @final + @property + def flags(self) -> Flags: + """ + Get the properties associated with this pandas object. + + The available flags are + + * :attr:`Flags.allows_duplicate_labels` + + See Also + -------- + Flags : Flags that apply to pandas objects. + DataFrame.attrs : Global metadata applying to this dataset. + + Notes + ----- + "Flags" differ from "metadata". Flags reflect properties of the + pandas object (the Series or DataFrame). Metadata refer to properties + of the dataset, and should be stored in :attr:`DataFrame.attrs`. + + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2]}) + >>> df.flags + + + Flags can be get or set using ``.`` + + >>> df.flags.allows_duplicate_labels + True + >>> df.flags.allows_duplicate_labels = False + + Or by slicing with a key + + >>> df.flags["allows_duplicate_labels"] + False + >>> df.flags["allows_duplicate_labels"] = True + """ + return self._flags + + @final + def set_flags( + self, + *, + copy: bool_t = False, + allows_duplicate_labels: bool_t | None = None, + ) -> Self: + """ + Return a new object with updated flags. + + Parameters + ---------- + copy : bool, default False + Specify if a copy of the object should be made. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + allows_duplicate_labels : bool, optional + Whether the returned object allows duplicate labels. + + Returns + ------- + Series or DataFrame + The same type as the caller. + + See Also + -------- + DataFrame.attrs : Global metadata applying to this dataset. + DataFrame.flags : Global flags applying to this object. + + Notes + ----- + This method returns a new object that's a view on the same data + as the input. Mutating the input or the output values will be reflected + in the other. + + This method is intended to be used in method chains. + + "Flags" differ from "metadata". Flags reflect properties of the + pandas object (the Series or DataFrame). Metadata refer to properties + of the dataset, and should be stored in :attr:`DataFrame.attrs`. + + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2]}) + >>> df.flags.allows_duplicate_labels + True + >>> df2 = df.set_flags(allows_duplicate_labels=False) + >>> df2.flags.allows_duplicate_labels + False + """ + df = self.copy(deep=copy and not using_copy_on_write()) + if allows_duplicate_labels is not None: + df.flags["allows_duplicate_labels"] = allows_duplicate_labels + return df + + @final + @classmethod + def _validate_dtype(cls, dtype) -> DtypeObj | None: + """validate the passed dtype""" + if dtype is not None: + dtype = pandas_dtype(dtype) + + # a compound dtype + if dtype.kind == "V": + raise NotImplementedError( + "compound dtypes are not implemented " + f"in the {cls.__name__} constructor" + ) + + return dtype + + # ---------------------------------------------------------------------- + # Construction + + @property + def _constructor(self) -> Callable[..., Self]: + """ + Used when a manipulation result has the same dimensions as the + original. + """ + raise AbstractMethodError(self) + + # ---------------------------------------------------------------------- + # Internals + + @final + @property + def _data(self): + # GH#33054 retained because some downstream packages uses this, + # e.g. fastparquet + # GH#33333 + warnings.warn( + f"{type(self).__name__}._data is deprecated and will be removed in " + "a future version. Use public APIs instead.", + DeprecationWarning, + stacklevel=find_stack_level(), + ) + return self._mgr + + # ---------------------------------------------------------------------- + # Axis + _AXIS_ORDERS: list[Literal["index", "columns"]] + _AXIS_TO_AXIS_NUMBER: dict[Axis, AxisInt] = {0: 0, "index": 0, "rows": 0} + _info_axis_number: int + _info_axis_name: Literal["index", "columns"] + _AXIS_LEN: int + + @final + def _construct_axes_dict(self, axes: Sequence[Axis] | None = None, **kwargs): + """Return an axes dictionary for myself.""" + d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)} + # error: Argument 1 to "update" of "MutableMapping" has incompatible type + # "Dict[str, Any]"; expected "SupportsKeysAndGetItem[Union[int, str], Any]" + d.update(kwargs) # type: ignore[arg-type] + return d + + @final + @classmethod + def _get_axis_number(cls, axis: Axis) -> AxisInt: + try: + return cls._AXIS_TO_AXIS_NUMBER[axis] + except KeyError: + raise ValueError(f"No axis named {axis} for object type {cls.__name__}") + + @final + @classmethod + def _get_axis_name(cls, axis: Axis) -> Literal["index", "columns"]: + axis_number = cls._get_axis_number(axis) + return cls._AXIS_ORDERS[axis_number] + + @final + def _get_axis(self, axis: Axis) -> Index: + axis_number = self._get_axis_number(axis) + assert axis_number in {0, 1} + return self.index if axis_number == 0 else self.columns + + @final + @classmethod + def _get_block_manager_axis(cls, axis: Axis) -> AxisInt: + """Map the axis to the block_manager axis.""" + axis = cls._get_axis_number(axis) + ndim = cls._AXIS_LEN + if ndim == 2: + # i.e. DataFrame + return 1 - axis + return axis + + @final + def _get_axis_resolvers(self, axis: str) -> dict[str, Series | MultiIndex]: + # index or columns + axis_index = getattr(self, axis) + d = {} + prefix = axis[0] + + for i, name in enumerate(axis_index.names): + if name is not None: + key = level = name + else: + # prefix with 'i' or 'c' depending on the input axis + # e.g., you must do ilevel_0 for the 0th level of an unnamed + # multiiindex + key = f"{prefix}level_{i}" + level = i + + level_values = axis_index.get_level_values(level) + s = level_values.to_series() + s.index = axis_index + d[key] = s + + # put the index/columns itself in the dict + if isinstance(axis_index, MultiIndex): + dindex = axis_index + else: + dindex = axis_index.to_series() + + d[axis] = dindex + return d + + @final + def _get_index_resolvers(self) -> dict[Hashable, Series | MultiIndex]: + from pandas.core.computation.parsing import clean_column_name + + d: dict[str, Series | MultiIndex] = {} + for axis_name in self._AXIS_ORDERS: + d.update(self._get_axis_resolvers(axis_name)) + + return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)} + + @final + def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]: + """ + Return the special character free column resolvers of a dataframe. + + Column names with special characters are 'cleaned up' so that they can + be referred to by backtick quoting. + Used in :meth:`DataFrame.eval`. + """ + from pandas.core.computation.parsing import clean_column_name + from pandas.core.series import Series + + if isinstance(self, ABCSeries): + return {clean_column_name(self.name): self} + + return { + clean_column_name(k): Series( + v, copy=False, index=self.index, name=k, dtype=self.dtypes[k] + ).__finalize__(self) + for k, v in zip(self.columns, self._iter_column_arrays()) + if not isinstance(k, int) + } + + @final + @property + def _info_axis(self) -> Index: + return getattr(self, self._info_axis_name) + + def _is_view_after_cow_rules(self): + # Only to be used in cases of chained assignment checks, this is a + # simplified check that assumes that either the whole object is a view + # or a copy + if len(self._mgr.blocks) == 0: # type: ignore[union-attr] + return False + return self._mgr.blocks[0].refs.has_reference() # type: ignore[union-attr] + + @property + def shape(self) -> tuple[int, ...]: + """ + Return a tuple of axis dimensions + """ + return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS) + + @property + def axes(self) -> list[Index]: + """ + Return index label(s) of the internal NDFrame + """ + # we do it this way because if we have reversed axes, then + # the block manager shows then reversed + return [self._get_axis(a) for a in self._AXIS_ORDERS] + + @final + @property + def ndim(self) -> int: + """ + Return an int representing the number of axes / array dimensions. + + Return 1 if Series. Otherwise return 2 if DataFrame. + + See Also + -------- + ndarray.ndim : Number of array dimensions. + + Examples + -------- + >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3}) + >>> s.ndim + 1 + + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df.ndim + 2 + """ + return self._mgr.ndim + + @final + @property + def size(self) -> int: + """ + Return an int representing the number of elements in this object. + + Return the number of rows if Series. Otherwise return the number of + rows times number of columns if DataFrame. + + See Also + -------- + ndarray.size : Number of elements in the array. + + Examples + -------- + >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3}) + >>> s.size + 3 + + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df.size + 4 + """ + + return int(np.prod(self.shape)) + + def set_axis( + self, + labels, + *, + axis: Axis = 0, + copy: bool_t | None = None, + ) -> Self: + """ + Assign desired index to given axis. + + Indexes for%(extended_summary_sub)s row labels can be changed by assigning + a list-like or Index. + + Parameters + ---------- + labels : list-like, Index + The values for the new index. + + axis : %(axes_single_arg)s, default 0 + The axis to update. The value 0 identifies the rows. For `Series` + this parameter is unused and defaults to 0. + + copy : bool, default True + Whether to make a copy of the underlying data. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + + Returns + ------- + %(klass)s + An object of type %(klass)s. + + See Also + -------- + %(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s. + """ + return self._set_axis_nocheck(labels, axis, inplace=False, copy=copy) + + @final + def _set_axis_nocheck( + self, labels, axis: Axis, inplace: bool_t, copy: bool_t | None + ): + if inplace: + setattr(self, self._get_axis_name(axis), labels) + else: + # With copy=False, we create a new object but don't copy the + # underlying data. + obj = self.copy(deep=copy and not using_copy_on_write()) + setattr(obj, obj._get_axis_name(axis), labels) + return obj + + @final + def _set_axis(self, axis: AxisInt, labels: AnyArrayLike | list) -> None: + """ + This is called from the cython code when we set the `index` attribute + directly, e.g. `series.index = [1, 2, 3]`. + """ + labels = ensure_index(labels) + self._mgr.set_axis(axis, labels) + self._clear_item_cache() + + @final + def swapaxes(self, axis1: Axis, axis2: Axis, copy: bool_t | None = None) -> Self: + """ + Interchange axes and swap values axes appropriately. + + .. deprecated:: 2.1.0 + ``swapaxes`` is deprecated and will be removed. + Please use ``transpose`` instead. + + Returns + ------- + same as input + + Examples + -------- + Please see examples for :meth:`DataFrame.transpose`. + """ + warnings.warn( + # GH#51946 + f"'{type(self).__name__}.swapaxes' is deprecated and " + "will be removed in a future version. " + f"Please use '{type(self).__name__}.transpose' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + i = self._get_axis_number(axis1) + j = self._get_axis_number(axis2) + + if i == j: + return self.copy(deep=copy and not using_copy_on_write()) + + mapping = {i: j, j: i} + + new_axes = [self._get_axis(mapping.get(k, k)) for k in range(self._AXIS_LEN)] + new_values = self._values.swapaxes(i, j) # type: ignore[union-attr] + if self._mgr.is_single_block and isinstance(self._mgr, BlockManager): + # This should only get hit in case of having a single block, otherwise a + # copy is made, we don't have to set up references. + new_mgr = ndarray_to_mgr( + new_values, + new_axes[0], + new_axes[1], + dtype=None, + copy=False, + typ="block", + ) + assert isinstance(new_mgr, BlockManager) + assert isinstance(self._mgr, BlockManager) + new_mgr.blocks[0].refs = self._mgr.blocks[0].refs + new_mgr.blocks[0].refs.add_reference(new_mgr.blocks[0]) + if not using_copy_on_write() and copy is not False: + new_mgr = new_mgr.copy(deep=True) + + out = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + return out.__finalize__(self, method="swapaxes") + + return self._constructor( + new_values, + *new_axes, + # The no-copy case for CoW is handled above + copy=False, + ).__finalize__(self, method="swapaxes") + + @final + @doc(klass=_shared_doc_kwargs["klass"]) + def droplevel(self, level: IndexLabel, axis: Axis = 0) -> Self: + """ + Return {klass} with requested index / column level(s) removed. + + Parameters + ---------- + level : int, str, or list-like + If a string is given, must be the name of a level + If list-like, elements must be names or positional indexes + of levels. + + axis : {{0 or 'index', 1 or 'columns'}}, default 0 + Axis along which the level(s) is removed: + + * 0 or 'index': remove level(s) in column. + * 1 or 'columns': remove level(s) in row. + + For `Series` this parameter is unused and defaults to 0. + + Returns + ------- + {klass} + {klass} with requested index / column level(s) removed. + + Examples + -------- + >>> df = pd.DataFrame([ + ... [1, 2, 3, 4], + ... [5, 6, 7, 8], + ... [9, 10, 11, 12] + ... ]).set_index([0, 1]).rename_axis(['a', 'b']) + + >>> df.columns = pd.MultiIndex.from_tuples([ + ... ('c', 'e'), ('d', 'f') + ... ], names=['level_1', 'level_2']) + + >>> df + level_1 c d + level_2 e f + a b + 1 2 3 4 + 5 6 7 8 + 9 10 11 12 + + >>> df.droplevel('a') + level_1 c d + level_2 e f + b + 2 3 4 + 6 7 8 + 10 11 12 + + >>> df.droplevel('level_2', axis=1) + level_1 c d + a b + 1 2 3 4 + 5 6 7 8 + 9 10 11 12 + """ + labels = self._get_axis(axis) + new_labels = labels.droplevel(level) + return self.set_axis(new_labels, axis=axis, copy=None) + + def pop(self, item: Hashable) -> Series | Any: + result = self[item] + del self[item] + + return result + + @final + def squeeze(self, axis: Axis | None = None): + """ + Squeeze 1 dimensional axis objects into scalars. + + Series or DataFrames with a single element are squeezed to a scalar. + DataFrames with a single column or a single row are squeezed to a + Series. Otherwise the object is unchanged. + + This method is most useful when you don't know if your + object is a Series or DataFrame, but you do know it has just a single + column. In that case you can safely call `squeeze` to ensure you have a + Series. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns', None}, default None + A specific axis to squeeze. By default, all length-1 axes are + squeezed. For `Series` this parameter is unused and defaults to `None`. + + Returns + ------- + DataFrame, Series, or scalar + The projection after squeezing `axis` or all the axes. + + See Also + -------- + Series.iloc : Integer-location based indexing for selecting scalars. + DataFrame.iloc : Integer-location based indexing for selecting Series. + Series.to_frame : Inverse of DataFrame.squeeze for a + single-column DataFrame. + + Examples + -------- + >>> primes = pd.Series([2, 3, 5, 7]) + + Slicing might produce a Series with a single value: + + >>> even_primes = primes[primes % 2 == 0] + >>> even_primes + 0 2 + dtype: int64 + + >>> even_primes.squeeze() + 2 + + Squeezing objects with more than one value in every axis does nothing: + + >>> odd_primes = primes[primes % 2 == 1] + >>> odd_primes + 1 3 + 2 5 + 3 7 + dtype: int64 + + >>> odd_primes.squeeze() + 1 3 + 2 5 + 3 7 + dtype: int64 + + Squeezing is even more effective when used with DataFrames. + + >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b']) + >>> df + a b + 0 1 2 + 1 3 4 + + Slicing a single column will produce a DataFrame with the columns + having only one value: + + >>> df_a = df[['a']] + >>> df_a + a + 0 1 + 1 3 + + So the columns can be squeezed down, resulting in a Series: + + >>> df_a.squeeze('columns') + 0 1 + 1 3 + Name: a, dtype: int64 + + Slicing a single row from a single column will produce a single + scalar DataFrame: + + >>> df_0a = df.loc[df.index < 1, ['a']] + >>> df_0a + a + 0 1 + + Squeezing the rows produces a single scalar Series: + + >>> df_0a.squeeze('rows') + a 1 + Name: 0, dtype: int64 + + Squeezing all axes will project directly into a scalar: + + >>> df_0a.squeeze() + 1 + """ + axes = range(self._AXIS_LEN) if axis is None else (self._get_axis_number(axis),) + result = self.iloc[ + tuple( + 0 if i in axes and len(a) == 1 else slice(None) + for i, a in enumerate(self.axes) + ) + ] + if isinstance(result, NDFrame): + result = result.__finalize__(self, method="squeeze") + return result + + # ---------------------------------------------------------------------- + # Rename + + @final + def _rename( + self, + mapper: Renamer | None = None, + *, + index: Renamer | None = None, + columns: Renamer | None = None, + axis: Axis | None = None, + copy: bool_t | None = None, + inplace: bool_t = False, + level: Level | None = None, + errors: str = "ignore", + ) -> Self | None: + # called by Series.rename and DataFrame.rename + + if mapper is None and index is None and columns is None: + raise TypeError("must pass an index to rename") + + if index is not None or columns is not None: + if axis is not None: + raise TypeError( + "Cannot specify both 'axis' and any of 'index' or 'columns'" + ) + if mapper is not None: + raise TypeError( + "Cannot specify both 'mapper' and any of 'index' or 'columns'" + ) + else: + # use the mapper argument + if axis and self._get_axis_number(axis) == 1: + columns = mapper + else: + index = mapper + + self._check_inplace_and_allows_duplicate_labels(inplace) + result = self if inplace else self.copy(deep=copy and not using_copy_on_write()) + + for axis_no, replacements in enumerate((index, columns)): + if replacements is None: + continue + + ax = self._get_axis(axis_no) + f = common.get_rename_function(replacements) + + if level is not None: + level = ax._get_level_number(level) + + # GH 13473 + if not callable(replacements): + if ax._is_multi and level is not None: + indexer = ax.get_level_values(level).get_indexer_for(replacements) + else: + indexer = ax.get_indexer_for(replacements) + + if errors == "raise" and len(indexer[indexer == -1]): + missing_labels = [ + label + for index, label in enumerate(replacements) + if indexer[index] == -1 + ] + raise KeyError(f"{missing_labels} not found in axis") + + new_index = ax._transform_index(f, level=level) + result._set_axis_nocheck(new_index, axis=axis_no, inplace=True, copy=False) + result._clear_item_cache() + + if inplace: + self._update_inplace(result) + return None + else: + return result.__finalize__(self, method="rename") + + @overload + def rename_axis( + self, + mapper: IndexLabel | lib.NoDefault = ..., + *, + index=..., + columns=..., + axis: Axis = ..., + copy: bool_t | None = ..., + inplace: Literal[False] = ..., + ) -> Self: + ... + + @overload + def rename_axis( + self, + mapper: IndexLabel | lib.NoDefault = ..., + *, + index=..., + columns=..., + axis: Axis = ..., + copy: bool_t | None = ..., + inplace: Literal[True], + ) -> None: + ... + + @overload + def rename_axis( + self, + mapper: IndexLabel | lib.NoDefault = ..., + *, + index=..., + columns=..., + axis: Axis = ..., + copy: bool_t | None = ..., + inplace: bool_t = ..., + ) -> Self | None: + ... + + def rename_axis( + self, + mapper: IndexLabel | lib.NoDefault = lib.no_default, + *, + index=lib.no_default, + columns=lib.no_default, + axis: Axis = 0, + copy: bool_t | None = None, + inplace: bool_t = False, + ) -> Self | None: + """ + Set the name of the axis for the index or columns. + + Parameters + ---------- + mapper : scalar, list-like, optional + Value to set the axis name attribute. + index, columns : scalar, list-like, dict-like or function, optional + A scalar, list-like, dict-like or functions transformations to + apply to that axis' values. + Note that the ``columns`` parameter is not allowed if the + object is a Series. This parameter only apply for DataFrame + type objects. + + Use either ``mapper`` and ``axis`` to + specify the axis to target with ``mapper``, or ``index`` + and/or ``columns``. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to rename. For `Series` this parameter is unused and defaults to 0. + copy : bool, default None + Also copy underlying data. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + inplace : bool, default False + Modifies the object directly, instead of creating a new Series + or DataFrame. + + Returns + ------- + Series, DataFrame, or None + The same type as the caller or None if ``inplace=True``. + + See Also + -------- + Series.rename : Alter Series index labels or name. + DataFrame.rename : Alter DataFrame index labels or name. + Index.rename : Set new names on index. + + Notes + ----- + ``DataFrame.rename_axis`` supports two calling conventions + + * ``(index=index_mapper, columns=columns_mapper, ...)`` + * ``(mapper, axis={'index', 'columns'}, ...)`` + + The first calling convention will only modify the names of + the index and/or the names of the Index object that is the columns. + In this case, the parameter ``copy`` is ignored. + + The second calling convention will modify the names of the + corresponding index if mapper is a list or a scalar. + However, if mapper is dict-like or a function, it will use the + deprecated behavior of modifying the axis *labels*. + + We *highly* recommend using keyword arguments to clarify your + intent. + + Examples + -------- + **Series** + + >>> s = pd.Series(["dog", "cat", "monkey"]) + >>> s + 0 dog + 1 cat + 2 monkey + dtype: object + >>> s.rename_axis("animal") + animal + 0 dog + 1 cat + 2 monkey + dtype: object + + **DataFrame** + + >>> df = pd.DataFrame({"num_legs": [4, 4, 2], + ... "num_arms": [0, 0, 2]}, + ... ["dog", "cat", "monkey"]) + >>> df + num_legs num_arms + dog 4 0 + cat 4 0 + monkey 2 2 + >>> df = df.rename_axis("animal") + >>> df + num_legs num_arms + animal + dog 4 0 + cat 4 0 + monkey 2 2 + >>> df = df.rename_axis("limbs", axis="columns") + >>> df + limbs num_legs num_arms + animal + dog 4 0 + cat 4 0 + monkey 2 2 + + **MultiIndex** + + >>> df.index = pd.MultiIndex.from_product([['mammal'], + ... ['dog', 'cat', 'monkey']], + ... names=['type', 'name']) + >>> df + limbs num_legs num_arms + type name + mammal dog 4 0 + cat 4 0 + monkey 2 2 + + >>> df.rename_axis(index={'type': 'class'}) + limbs num_legs num_arms + class name + mammal dog 4 0 + cat 4 0 + monkey 2 2 + + >>> df.rename_axis(columns=str.upper) + LIMBS num_legs num_arms + type name + mammal dog 4 0 + cat 4 0 + monkey 2 2 + """ + axes = {"index": index, "columns": columns} + + if axis is not None: + axis = self._get_axis_number(axis) + + inplace = validate_bool_kwarg(inplace, "inplace") + + if copy and using_copy_on_write(): + copy = False + + if mapper is not lib.no_default: + # Use v0.23 behavior if a scalar or list + non_mapper = is_scalar(mapper) or ( + is_list_like(mapper) and not is_dict_like(mapper) + ) + if non_mapper: + return self._set_axis_name( + mapper, axis=axis, inplace=inplace, copy=copy + ) + else: + raise ValueError("Use `.rename` to alter labels with a mapper.") + else: + # Use new behavior. Means that index and/or columns + # is specified + result = self if inplace else self.copy(deep=copy) + + for axis in range(self._AXIS_LEN): + v = axes.get(self._get_axis_name(axis)) + if v is lib.no_default: + continue + non_mapper = is_scalar(v) or (is_list_like(v) and not is_dict_like(v)) + if non_mapper: + newnames = v + else: + f = common.get_rename_function(v) + curnames = self._get_axis(axis).names + newnames = [f(name) for name in curnames] + result._set_axis_name(newnames, axis=axis, inplace=True, copy=copy) + if not inplace: + return result + return None + + @final + def _set_axis_name( + self, name, axis: Axis = 0, inplace: bool_t = False, copy: bool_t | None = True + ): + """ + Set the name(s) of the axis. + + Parameters + ---------- + name : str or list of str + Name(s) to set. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to set the label. The value 0 or 'index' specifies index, + and the value 1 or 'columns' specifies columns. + inplace : bool, default False + If `True`, do operation inplace and return None. + copy: + Whether to make a copy of the result. + + Returns + ------- + Series, DataFrame, or None + The same type as the caller or `None` if `inplace` is `True`. + + See Also + -------- + DataFrame.rename : Alter the axis labels of :class:`DataFrame`. + Series.rename : Alter the index labels or set the index name + of :class:`Series`. + Index.rename : Set the name of :class:`Index` or :class:`MultiIndex`. + + Examples + -------- + >>> df = pd.DataFrame({"num_legs": [4, 4, 2]}, + ... ["dog", "cat", "monkey"]) + >>> df + num_legs + dog 4 + cat 4 + monkey 2 + >>> df._set_axis_name("animal") + num_legs + animal + dog 4 + cat 4 + monkey 2 + >>> df.index = pd.MultiIndex.from_product( + ... [["mammal"], ['dog', 'cat', 'monkey']]) + >>> df._set_axis_name(["type", "name"]) + num_legs + type name + mammal dog 4 + cat 4 + monkey 2 + """ + axis = self._get_axis_number(axis) + idx = self._get_axis(axis).set_names(name) + + inplace = validate_bool_kwarg(inplace, "inplace") + renamed = self if inplace else self.copy(deep=copy) + if axis == 0: + renamed.index = idx + else: + renamed.columns = idx + + if not inplace: + return renamed + + # ---------------------------------------------------------------------- + # Comparison Methods + + @final + def _indexed_same(self, other) -> bool_t: + return all( + self._get_axis(a).equals(other._get_axis(a)) for a in self._AXIS_ORDERS + ) + + @final + def equals(self, other: object) -> bool_t: + """ + Test whether two objects contain the same elements. + + This function allows two Series or DataFrames to be compared against + each other to see if they have the same shape and elements. NaNs in + the same location are considered equal. + + The row/column index do not need to have the same type, as long + as the values are considered equal. Corresponding columns and + index must be of the same dtype. + + Parameters + ---------- + other : Series or DataFrame + The other Series or DataFrame to be compared with the first. + + Returns + ------- + bool + True if all elements are the same in both objects, False + otherwise. + + See Also + -------- + Series.eq : Compare two Series objects of the same length + and return a Series where each element is True if the element + in each Series is equal, False otherwise. + DataFrame.eq : Compare two DataFrame objects of the same shape and + return a DataFrame where each element is True if the respective + element in each DataFrame is equal, False otherwise. + testing.assert_series_equal : Raises an AssertionError if left and + right are not equal. Provides an easy interface to ignore + inequality in dtypes, indexes and precision among others. + testing.assert_frame_equal : Like assert_series_equal, but targets + DataFrames. + numpy.array_equal : Return True if two arrays have the same shape + and elements, False otherwise. + + Examples + -------- + >>> df = pd.DataFrame({1: [10], 2: [20]}) + >>> df + 1 2 + 0 10 20 + + DataFrames df and exactly_equal have the same types and values for + their elements and column labels, which will return True. + + >>> exactly_equal = pd.DataFrame({1: [10], 2: [20]}) + >>> exactly_equal + 1 2 + 0 10 20 + >>> df.equals(exactly_equal) + True + + DataFrames df and different_column_type have the same element + types and values, but have different types for the column labels, + which will still return True. + + >>> different_column_type = pd.DataFrame({1.0: [10], 2.0: [20]}) + >>> different_column_type + 1.0 2.0 + 0 10 20 + >>> df.equals(different_column_type) + True + + DataFrames df and different_data_type have different types for the + same values for their elements, and will return False even though + their column labels are the same values and types. + + >>> different_data_type = pd.DataFrame({1: [10.0], 2: [20.0]}) + >>> different_data_type + 1 2 + 0 10.0 20.0 + >>> df.equals(different_data_type) + False + """ + if not (isinstance(other, type(self)) or isinstance(self, type(other))): + return False + other = cast(NDFrame, other) + return self._mgr.equals(other._mgr) + + # ------------------------------------------------------------------------- + # Unary Methods + + @final + def __neg__(self) -> Self: + def blk_func(values: ArrayLike): + if is_bool_dtype(values.dtype): + # error: Argument 1 to "inv" has incompatible type "Union + # [ExtensionArray, ndarray[Any, Any]]"; expected + # "_SupportsInversion[ndarray[Any, dtype[bool_]]]" + return operator.inv(values) # type: ignore[arg-type] + else: + # error: Argument 1 to "neg" has incompatible type "Union + # [ExtensionArray, ndarray[Any, Any]]"; expected + # "_SupportsNeg[ndarray[Any, dtype[Any]]]" + return operator.neg(values) # type: ignore[arg-type] + + new_data = self._mgr.apply(blk_func) + res = self._constructor_from_mgr(new_data, axes=new_data.axes) + return res.__finalize__(self, method="__neg__") + + @final + def __pos__(self) -> Self: + def blk_func(values: ArrayLike): + if is_bool_dtype(values.dtype): + return values.copy() + else: + # error: Argument 1 to "pos" has incompatible type "Union + # [ExtensionArray, ndarray[Any, Any]]"; expected + # "_SupportsPos[ndarray[Any, dtype[Any]]]" + return operator.pos(values) # type: ignore[arg-type] + + new_data = self._mgr.apply(blk_func) + res = self._constructor_from_mgr(new_data, axes=new_data.axes) + return res.__finalize__(self, method="__pos__") + + @final + def __invert__(self) -> Self: + if not self.size: + # inv fails with 0 len + return self.copy(deep=False) + + new_data = self._mgr.apply(operator.invert) + res = self._constructor_from_mgr(new_data, axes=new_data.axes) + return res.__finalize__(self, method="__invert__") + + @final + def __nonzero__(self) -> NoReturn: + raise ValueError( + f"The truth value of a {type(self).__name__} is ambiguous. " + "Use a.empty, a.bool(), a.item(), a.any() or a.all()." + ) + + __bool__ = __nonzero__ + + @final + def bool(self) -> bool_t: + """ + Return the bool of a single element Series or DataFrame. + + .. deprecated:: 2.1.0 + + bool is deprecated and will be removed in future version of pandas. + For ``Series`` use ``pandas.Series.item``. + + This must be a boolean scalar value, either True or False. It will raise a + ValueError if the Series or DataFrame does not have exactly 1 element, or that + element is not boolean (integer values 0 and 1 will also raise an exception). + + Returns + ------- + bool + The value in the Series or DataFrame. + + See Also + -------- + Series.astype : Change the data type of a Series, including to boolean. + DataFrame.astype : Change the data type of a DataFrame, including to boolean. + numpy.bool_ : NumPy boolean data type, used by pandas for boolean values. + + Examples + -------- + The method will only work for single element objects with a boolean value: + + >>> pd.Series([True]).bool() # doctest: +SKIP + True + >>> pd.Series([False]).bool() # doctest: +SKIP + False + + >>> pd.DataFrame({'col': [True]}).bool() # doctest: +SKIP + True + >>> pd.DataFrame({'col': [False]}).bool() # doctest: +SKIP + False + + This is an alternative method and will only work + for single element objects with a boolean value: + + >>> pd.Series([True]).item() # doctest: +SKIP + True + >>> pd.Series([False]).item() # doctest: +SKIP + False + """ + + warnings.warn( + f"{type(self).__name__}.bool is now deprecated and will be removed " + "in future version of pandas", + FutureWarning, + stacklevel=find_stack_level(), + ) + v = self.squeeze() + if isinstance(v, (bool, np.bool_)): + return bool(v) + elif is_scalar(v): + raise ValueError( + "bool cannot act on a non-boolean single element " + f"{type(self).__name__}" + ) + + self.__nonzero__() + # for mypy (__nonzero__ raises) + return True + + @final + def abs(self) -> Self: + """ + Return a Series/DataFrame with absolute numeric value of each element. + + This function only applies to elements that are all numeric. + + Returns + ------- + abs + Series/DataFrame containing the absolute value of each element. + + See Also + -------- + numpy.absolute : Calculate the absolute value element-wise. + + Notes + ----- + For ``complex`` inputs, ``1.2 + 1j``, the absolute value is + :math:`\\sqrt{ a^2 + b^2 }`. + + Examples + -------- + Absolute numeric values in a Series. + + >>> s = pd.Series([-1.10, 2, -3.33, 4]) + >>> s.abs() + 0 1.10 + 1 2.00 + 2 3.33 + 3 4.00 + dtype: float64 + + Absolute numeric values in a Series with complex numbers. + + >>> s = pd.Series([1.2 + 1j]) + >>> s.abs() + 0 1.56205 + dtype: float64 + + Absolute numeric values in a Series with a Timedelta element. + + >>> s = pd.Series([pd.Timedelta('1 days')]) + >>> s.abs() + 0 1 days + dtype: timedelta64[ns] + + Select rows with data closest to certain value using argsort (from + `StackOverflow `__). + + >>> df = pd.DataFrame({ + ... 'a': [4, 5, 6, 7], + ... 'b': [10, 20, 30, 40], + ... 'c': [100, 50, -30, -50] + ... }) + >>> df + a b c + 0 4 10 100 + 1 5 20 50 + 2 6 30 -30 + 3 7 40 -50 + >>> df.loc[(df.c - 43).abs().argsort()] + a b c + 1 5 20 50 + 0 4 10 100 + 2 6 30 -30 + 3 7 40 -50 + """ + res_mgr = self._mgr.apply(np.abs) + return self._constructor_from_mgr(res_mgr, axes=res_mgr.axes).__finalize__( + self, name="abs" + ) + + @final + def __abs__(self) -> Self: + return self.abs() + + @final + def __round__(self, decimals: int = 0) -> Self: + return self.round(decimals).__finalize__(self, method="__round__") + + # ------------------------------------------------------------------------- + # Label or Level Combination Helpers + # + # A collection of helper methods for DataFrame/Series operations that + # accept a combination of column/index labels and levels. All such + # operations should utilize/extend these methods when possible so that we + # have consistent precedence and validation logic throughout the library. + + @final + def _is_level_reference(self, key: Level, axis: Axis = 0) -> bool_t: + """ + Test whether a key is a level reference for a given axis. + + To be considered a level reference, `key` must be a string that: + - (axis=0): Matches the name of an index level and does NOT match + a column label. + - (axis=1): Matches the name of a column level and does NOT match + an index label. + + Parameters + ---------- + key : Hashable + Potential level name for the given axis + axis : int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + is_level : bool + """ + axis_int = self._get_axis_number(axis) + + return ( + key is not None + and is_hashable(key) + and key in self.axes[axis_int].names + and not self._is_label_reference(key, axis=axis_int) + ) + + @final + def _is_label_reference(self, key: Level, axis: Axis = 0) -> bool_t: + """ + Test whether a key is a label reference for a given axis. + + To be considered a label reference, `key` must be a string that: + - (axis=0): Matches a column label + - (axis=1): Matches an index label + + Parameters + ---------- + key : Hashable + Potential label name, i.e. Index entry. + axis : int, default 0 + Axis perpendicular to the axis that labels are associated with + (0 means search for column labels, 1 means search for index labels) + + Returns + ------- + is_label: bool + """ + axis_int = self._get_axis_number(axis) + other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis_int) + + return ( + key is not None + and is_hashable(key) + and any(key in self.axes[ax] for ax in other_axes) + ) + + @final + def _is_label_or_level_reference(self, key: Level, axis: AxisInt = 0) -> bool_t: + """ + Test whether a key is a label or level reference for a given axis. + + To be considered either a label or a level reference, `key` must be a + string that: + - (axis=0): Matches a column label or an index level + - (axis=1): Matches an index label or a column level + + Parameters + ---------- + key : Hashable + Potential label or level name + axis : int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + bool + """ + return self._is_level_reference(key, axis=axis) or self._is_label_reference( + key, axis=axis + ) + + @final + def _check_label_or_level_ambiguity(self, key: Level, axis: Axis = 0) -> None: + """ + Check whether `key` is ambiguous. + + By ambiguous, we mean that it matches both a level of the input + `axis` and a label of the other axis. + + Parameters + ---------- + key : Hashable + Label or level name. + axis : int, default 0 + Axis that levels are associated with (0 for index, 1 for columns). + + Raises + ------ + ValueError: `key` is ambiguous + """ + + axis_int = self._get_axis_number(axis) + other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis_int) + + if ( + key is not None + and is_hashable(key) + and key in self.axes[axis_int].names + and any(key in self.axes[ax] for ax in other_axes) + ): + # Build an informative and grammatical warning + level_article, level_type = ( + ("an", "index") if axis_int == 0 else ("a", "column") + ) + + label_article, label_type = ( + ("a", "column") if axis_int == 0 else ("an", "index") + ) + + msg = ( + f"'{key}' is both {level_article} {level_type} level and " + f"{label_article} {label_type} label, which is ambiguous." + ) + raise ValueError(msg) + + @final + def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike: + """ + Return a 1-D array of values associated with `key`, a label or level + from the given `axis`. + + Retrieval logic: + - (axis=0): Return column values if `key` matches a column label. + Otherwise return index level values if `key` matches an index + level. + - (axis=1): Return row values if `key` matches an index label. + Otherwise return column level values if 'key' matches a column + level + + Parameters + ---------- + key : Hashable + Label or level name. + axis : int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + np.ndarray or ExtensionArray + + Raises + ------ + KeyError + if `key` matches neither a label nor a level + ValueError + if `key` matches multiple labels + """ + axis = self._get_axis_number(axis) + other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis] + + if self._is_label_reference(key, axis=axis): + self._check_label_or_level_ambiguity(key, axis=axis) + values = self.xs(key, axis=other_axes[0])._values + elif self._is_level_reference(key, axis=axis): + values = self.axes[axis].get_level_values(key)._values + else: + raise KeyError(key) + + # Check for duplicates + if values.ndim > 1: + if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex): + multi_message = ( + "\n" + "For a multi-index, the label must be a " + "tuple with elements corresponding to each level." + ) + else: + multi_message = "" + + label_axis_name = "column" if axis == 0 else "index" + raise ValueError( + f"The {label_axis_name} label '{key}' is not unique.{multi_message}" + ) + + return values + + @final + def _drop_labels_or_levels(self, keys, axis: AxisInt = 0): + """ + Drop labels and/or levels for the given `axis`. + + For each key in `keys`: + - (axis=0): If key matches a column label then drop the column. + Otherwise if key matches an index level then drop the level. + - (axis=1): If key matches an index label then drop the row. + Otherwise if key matches a column level then drop the level. + + Parameters + ---------- + keys : str or list of str + labels or levels to drop + axis : int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + dropped: DataFrame + + Raises + ------ + ValueError + if any `keys` match neither a label nor a level + """ + axis = self._get_axis_number(axis) + + # Validate keys + keys = common.maybe_make_list(keys) + invalid_keys = [ + k for k in keys if not self._is_label_or_level_reference(k, axis=axis) + ] + + if invalid_keys: + raise ValueError( + "The following keys are not valid labels or " + f"levels for axis {axis}: {invalid_keys}" + ) + + # Compute levels and labels to drop + levels_to_drop = [k for k in keys if self._is_level_reference(k, axis=axis)] + + labels_to_drop = [k for k in keys if not self._is_level_reference(k, axis=axis)] + + # Perform copy upfront and then use inplace operations below. + # This ensures that we always perform exactly one copy. + # ``copy`` and/or ``inplace`` options could be added in the future. + dropped = self.copy(deep=False) + + if axis == 0: + # Handle dropping index levels + if levels_to_drop: + dropped.reset_index(levels_to_drop, drop=True, inplace=True) + + # Handle dropping columns labels + if labels_to_drop: + dropped.drop(labels_to_drop, axis=1, inplace=True) + else: + # Handle dropping column levels + if levels_to_drop: + if isinstance(dropped.columns, MultiIndex): + # Drop the specified levels from the MultiIndex + dropped.columns = dropped.columns.droplevel(levels_to_drop) + else: + # Drop the last level of Index by replacing with + # a RangeIndex + dropped.columns = RangeIndex(dropped.columns.size) + + # Handle dropping index labels + if labels_to_drop: + dropped.drop(labels_to_drop, axis=0, inplace=True) + + return dropped + + # ---------------------------------------------------------------------- + # Iteration + + # https://github.com/python/typeshed/issues/2148#issuecomment-520783318 + # Incompatible types in assignment (expression has type "None", base class + # "object" defined the type as "Callable[[object], int]") + __hash__: ClassVar[None] # type: ignore[assignment] + + def __iter__(self) -> Iterator: + """ + Iterate over info axis. + + Returns + ------- + iterator + Info axis as iterator. + + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + >>> for x in df: + ... print(x) + A + B + """ + return iter(self._info_axis) + + # can we get a better explanation of this? + def keys(self) -> Index: + """ + Get the 'info axis' (see Indexing for more). + + This is index for Series, columns for DataFrame. + + Returns + ------- + Index + Info axis. + + Examples + -------- + >>> d = pd.DataFrame(data={'A': [1, 2, 3], 'B': [0, 4, 8]}, + ... index=['a', 'b', 'c']) + >>> d + A B + a 1 0 + b 2 4 + c 3 8 + >>> d.keys() + Index(['A', 'B'], dtype='object') + """ + return self._info_axis + + def items(self): + """ + Iterate over (label, values) on info axis + + This is index for Series and columns for DataFrame. + + Returns + ------- + Generator + """ + for h in self._info_axis: + yield h, self[h] + + def __len__(self) -> int: + """Returns length of info axis""" + return len(self._info_axis) + + @final + def __contains__(self, key) -> bool_t: + """True if the key is in the info axis""" + return key in self._info_axis + + @property + def empty(self) -> bool_t: + """ + Indicator whether Series/DataFrame is empty. + + True if Series/DataFrame is entirely empty (no items), meaning any of the + axes are of length 0. + + Returns + ------- + bool + If Series/DataFrame is empty, return True, if not return False. + + See Also + -------- + Series.dropna : Return series without null values. + DataFrame.dropna : Return DataFrame with labels on given axis omitted + where (all or any) data are missing. + + Notes + ----- + If Series/DataFrame contains only NaNs, it is still not considered empty. See + the example below. + + Examples + -------- + An example of an actual empty DataFrame. Notice the index is empty: + + >>> df_empty = pd.DataFrame({'A' : []}) + >>> df_empty + Empty DataFrame + Columns: [A] + Index: [] + >>> df_empty.empty + True + + If we only have NaNs in our DataFrame, it is not considered empty! We + will need to drop the NaNs to make the DataFrame empty: + + >>> df = pd.DataFrame({'A' : [np.nan]}) + >>> df + A + 0 NaN + >>> df.empty + False + >>> df.dropna().empty + True + + >>> ser_empty = pd.Series({'A' : []}) + >>> ser_empty + A [] + dtype: object + >>> ser_empty.empty + False + >>> ser_empty = pd.Series() + >>> ser_empty.empty + True + """ + return any(len(self._get_axis(a)) == 0 for a in self._AXIS_ORDERS) + + # ---------------------------------------------------------------------- + # Array Interface + + # This is also set in IndexOpsMixin + # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented + __array_priority__: int = 1000 + + def __array__( + self, dtype: npt.DTypeLike | None = None, copy: bool_t | None = None + ) -> np.ndarray: + values = self._values + arr = np.asarray(values, dtype=dtype) + if ( + astype_is_view(values.dtype, arr.dtype) + and using_copy_on_write() + and self._mgr.is_single_block + ): + # Check if both conversions can be done without a copy + if astype_is_view(self.dtypes.iloc[0], values.dtype) and astype_is_view( + values.dtype, arr.dtype + ): + arr = arr.view() + arr.flags.writeable = False + return arr + + @final + def __array_ufunc__( + self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any + ): + return arraylike.array_ufunc(self, ufunc, method, *inputs, **kwargs) + + # ---------------------------------------------------------------------- + # Picklability + + @final + def __getstate__(self) -> dict[str, Any]: + meta = {k: getattr(self, k, None) for k in self._metadata} + return { + "_mgr": self._mgr, + "_typ": self._typ, + "_metadata": self._metadata, + "attrs": self.attrs, + "_flags": {k: self.flags[k] for k in self.flags._keys}, + **meta, + } + + @final + def __setstate__(self, state) -> None: + if isinstance(state, BlockManager): + self._mgr = state + elif isinstance(state, dict): + if "_data" in state and "_mgr" not in state: + # compat for older pickles + state["_mgr"] = state.pop("_data") + typ = state.get("_typ") + if typ is not None: + attrs = state.get("_attrs", {}) + if attrs is None: # should not happen, but better be on the safe side + attrs = {} + object.__setattr__(self, "_attrs", attrs) + flags = state.get("_flags", {"allows_duplicate_labels": True}) + object.__setattr__(self, "_flags", Flags(self, **flags)) + + # set in the order of internal names + # to avoid definitional recursion + # e.g. say fill_value needing _mgr to be + # defined + meta = set(self._internal_names + self._metadata) + for k in list(meta): + if k in state and k != "_flags": + v = state[k] + object.__setattr__(self, k, v) + + for k, v in state.items(): + if k not in meta: + object.__setattr__(self, k, v) + + else: + raise NotImplementedError("Pre-0.12 pickles are no longer supported") + elif len(state) == 2: + raise NotImplementedError("Pre-0.12 pickles are no longer supported") + + self._item_cache: dict[Hashable, Series] = {} + + # ---------------------------------------------------------------------- + # Rendering Methods + + def __repr__(self) -> str: + # string representation based upon iterating over self + # (since, by definition, `PandasContainers` are iterable) + prepr = f"[{','.join(map(pprint_thing, self))}]" + return f"{type(self).__name__}({prepr})" + + @final + def _repr_latex_(self): + """ + Returns a LaTeX representation for a particular object. + Mainly for use with nbconvert (jupyter notebook conversion to pdf). + """ + if config.get_option("styler.render.repr") == "latex": + return self.to_latex() + else: + return None + + @final + def _repr_data_resource_(self): + """ + Not a real Jupyter special repr method, but we use the same + naming convention. + """ + if config.get_option("display.html.table_schema"): + data = self.head(config.get_option("display.max_rows")) + + as_json = data.to_json(orient="table") + as_json = cast(str, as_json) + return loads(as_json, object_pairs_hook=collections.OrderedDict) + + # ---------------------------------------------------------------------- + # I/O Methods + + @final + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "excel_writer"], name="to_excel" + ) + @doc( + klass="object", + storage_options=_shared_docs["storage_options"], + storage_options_versionadded="1.2.0", + ) + def to_excel( + self, + excel_writer: FilePath | WriteExcelBuffer | ExcelWriter, + sheet_name: str = "Sheet1", + na_rep: str = "", + float_format: str | None = None, + columns: Sequence[Hashable] | None = None, + header: Sequence[Hashable] | bool_t = True, + index: bool_t = True, + index_label: IndexLabel | None = None, + startrow: int = 0, + startcol: int = 0, + engine: Literal["openpyxl", "xlsxwriter"] | None = None, + merge_cells: bool_t = True, + inf_rep: str = "inf", + freeze_panes: tuple[int, int] | None = None, + storage_options: StorageOptions | None = None, + engine_kwargs: dict[str, Any] | None = None, + ) -> None: + """ + Write {klass} to an Excel sheet. + + To write a single {klass} to an Excel .xlsx file it is only necessary to + specify a target file name. To write to multiple sheets it is necessary to + create an `ExcelWriter` object with a target file name, and specify a sheet + in the file to write to. + + Multiple sheets may be written to by specifying unique `sheet_name`. + With all data written to the file it is necessary to save the changes. + Note that creating an `ExcelWriter` object with a file name that already + exists will result in the contents of the existing file being erased. + + Parameters + ---------- + excel_writer : path-like, file-like, or ExcelWriter object + File path or existing ExcelWriter. + sheet_name : str, default 'Sheet1' + Name of sheet which will contain DataFrame. + na_rep : str, default '' + Missing data representation. + float_format : str, optional + Format string for floating point numbers. For example + ``float_format="%.2f"`` will format 0.1234 to 0.12. + columns : sequence or list of str, optional + Columns to write. + header : bool or list of str, default True + Write out the column names. If a list of string is given it is + assumed to be aliases for the column names. + index : bool, default True + Write row names (index). + index_label : str or sequence, optional + Column label for index column(s) if desired. If not specified, and + `header` and `index` are True, then the index names are used. A + sequence should be given if the DataFrame uses MultiIndex. + startrow : int, default 0 + Upper left cell row to dump data frame. + startcol : int, default 0 + Upper left cell column to dump data frame. + engine : str, optional + Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this + via the options ``io.excel.xlsx.writer`` or + ``io.excel.xlsm.writer``. + + merge_cells : bool, default True + Write MultiIndex and Hierarchical Rows as merged cells. + inf_rep : str, default 'inf' + Representation for infinity (there is no native representation for + infinity in Excel). + freeze_panes : tuple of int (length 2), optional + Specifies the one-based bottommost row and rightmost column that + is to be frozen. + {storage_options} + + .. versionadded:: {storage_options_versionadded} + engine_kwargs : dict, optional + Arbitrary keyword arguments passed to excel engine. + + See Also + -------- + to_csv : Write DataFrame to a comma-separated values (csv) file. + ExcelWriter : Class for writing DataFrame objects into excel sheets. + read_excel : Read an Excel file into a pandas DataFrame. + read_csv : Read a comma-separated values (csv) file into DataFrame. + io.formats.style.Styler.to_excel : Add styles to Excel sheet. + + Notes + ----- + For compatibility with :meth:`~DataFrame.to_csv`, + to_excel serializes lists and dicts to strings before writing. + + Once a workbook has been saved it is not possible to write further + data without rewriting the whole workbook. + + Examples + -------- + + Create, write to and save a workbook: + + >>> df1 = pd.DataFrame([['a', 'b'], ['c', 'd']], + ... index=['row 1', 'row 2'], + ... columns=['col 1', 'col 2']) + >>> df1.to_excel("output.xlsx") # doctest: +SKIP + + To specify the sheet name: + + >>> df1.to_excel("output.xlsx", + ... sheet_name='Sheet_name_1') # doctest: +SKIP + + If you wish to write to more than one sheet in the workbook, it is + necessary to specify an ExcelWriter object: + + >>> df2 = df1.copy() + >>> with pd.ExcelWriter('output.xlsx') as writer: # doctest: +SKIP + ... df1.to_excel(writer, sheet_name='Sheet_name_1') + ... df2.to_excel(writer, sheet_name='Sheet_name_2') + + ExcelWriter can also be used to append to an existing Excel file: + + >>> with pd.ExcelWriter('output.xlsx', + ... mode='a') as writer: # doctest: +SKIP + ... df1.to_excel(writer, sheet_name='Sheet_name_3') + + To set the library that is used to write the Excel file, + you can pass the `engine` keyword (the default engine is + automatically chosen depending on the file extension): + + >>> df1.to_excel('output1.xlsx', engine='xlsxwriter') # doctest: +SKIP + """ + if engine_kwargs is None: + engine_kwargs = {} + + df = self if isinstance(self, ABCDataFrame) else self.to_frame() + + from pandas.io.formats.excel import ExcelFormatter + + formatter = ExcelFormatter( + df, + na_rep=na_rep, + cols=columns, + header=header, + float_format=float_format, + index=index, + index_label=index_label, + merge_cells=merge_cells, + inf_rep=inf_rep, + ) + formatter.write( + excel_writer, + sheet_name=sheet_name, + startrow=startrow, + startcol=startcol, + freeze_panes=freeze_panes, + engine=engine, + storage_options=storage_options, + engine_kwargs=engine_kwargs, + ) + + @final + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "path_or_buf"], name="to_json" + ) + @doc( + storage_options=_shared_docs["storage_options"], + compression_options=_shared_docs["compression_options"] % "path_or_buf", + ) + def to_json( + self, + path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, + orient: Literal["split", "records", "index", "table", "columns", "values"] + | None = None, + date_format: str | None = None, + double_precision: int = 10, + force_ascii: bool_t = True, + date_unit: TimeUnit = "ms", + default_handler: Callable[[Any], JSONSerializable] | None = None, + lines: bool_t = False, + compression: CompressionOptions = "infer", + index: bool_t | None = None, + indent: int | None = None, + storage_options: StorageOptions | None = None, + mode: Literal["a", "w"] = "w", + ) -> str | None: + """ + Convert the object to a JSON string. + + Note NaN's and None will be converted to null and datetime objects + will be converted to UNIX timestamps. + + Parameters + ---------- + path_or_buf : str, path object, file-like object, or None, default None + String, path object (implementing os.PathLike[str]), or file-like + object implementing a write() function. If None, the result is + returned as a string. + orient : str + Indication of expected JSON string format. + + * Series: + + - default is 'index' + - allowed values are: {{'split', 'records', 'index', 'table'}}. + + * DataFrame: + + - default is 'columns' + - allowed values are: {{'split', 'records', 'index', 'columns', + 'values', 'table'}}. + + * The format of the JSON string: + + - 'split' : dict like {{'index' -> [index], 'columns' -> [columns], + 'data' -> [values]}} + - 'records' : list like [{{column -> value}}, ... , {{column -> value}}] + - 'index' : dict like {{index -> {{column -> value}}}} + - 'columns' : dict like {{column -> {{index -> value}}}} + - 'values' : just the values array + - 'table' : dict like {{'schema': {{schema}}, 'data': {{data}}}} + + Describing the data, where data component is like ``orient='records'``. + + date_format : {{None, 'epoch', 'iso'}} + Type of date conversion. 'epoch' = epoch milliseconds, + 'iso' = ISO8601. The default depends on the `orient`. For + ``orient='table'``, the default is 'iso'. For all other orients, + the default is 'epoch'. + double_precision : int, default 10 + The number of decimal places to use when encoding + floating point values. The possible maximal value is 15. + Passing double_precision greater than 15 will raise a ValueError. + force_ascii : bool, default True + Force encoded string to be ASCII. + date_unit : str, default 'ms' (milliseconds) + The time unit to encode to, governs timestamp and ISO8601 + precision. One of 's', 'ms', 'us', 'ns' for second, millisecond, + microsecond, and nanosecond respectively. + default_handler : callable, default None + Handler to call if object cannot otherwise be converted to a + suitable format for JSON. Should receive a single argument which is + the object to convert and return a serialisable object. + lines : bool, default False + If 'orient' is 'records' write out line-delimited json format. Will + throw ValueError if incorrect 'orient' since others are not + list-like. + {compression_options} + + .. versionchanged:: 1.4.0 Zstandard support. + + index : bool or None, default None + The index is only used when 'orient' is 'split', 'index', 'column', + or 'table'. Of these, 'index' and 'column' do not support + `index=False`. + + indent : int, optional + Length of whitespace used to indent each record. + + {storage_options} + + mode : str, default 'w' (writing) + Specify the IO mode for output when supplying a path_or_buf. + Accepted args are 'w' (writing) and 'a' (append) only. + mode='a' is only supported when lines is True and orient is 'records'. + + Returns + ------- + None or str + If path_or_buf is None, returns the resulting json format as a + string. Otherwise returns None. + + See Also + -------- + read_json : Convert a JSON string to pandas object. + + Notes + ----- + The behavior of ``indent=0`` varies from the stdlib, which does not + indent the output but does insert newlines. Currently, ``indent=0`` + and the default ``indent=None`` are equivalent in pandas, though this + may change in a future release. + + ``orient='table'`` contains a 'pandas_version' field under 'schema'. + This stores the version of `pandas` used in the latest revision of the + schema. + + Examples + -------- + >>> from json import loads, dumps + >>> df = pd.DataFrame( + ... [["a", "b"], ["c", "d"]], + ... index=["row 1", "row 2"], + ... columns=["col 1", "col 2"], + ... ) + + >>> result = df.to_json(orient="split") + >>> parsed = loads(result) + >>> dumps(parsed, indent=4) # doctest: +SKIP + {{ + "columns": [ + "col 1", + "col 2" + ], + "index": [ + "row 1", + "row 2" + ], + "data": [ + [ + "a", + "b" + ], + [ + "c", + "d" + ] + ] + }} + + Encoding/decoding a Dataframe using ``'records'`` formatted JSON. + Note that index labels are not preserved with this encoding. + + >>> result = df.to_json(orient="records") + >>> parsed = loads(result) + >>> dumps(parsed, indent=4) # doctest: +SKIP + [ + {{ + "col 1": "a", + "col 2": "b" + }}, + {{ + "col 1": "c", + "col 2": "d" + }} + ] + + Encoding/decoding a Dataframe using ``'index'`` formatted JSON: + + >>> result = df.to_json(orient="index") + >>> parsed = loads(result) + >>> dumps(parsed, indent=4) # doctest: +SKIP + {{ + "row 1": {{ + "col 1": "a", + "col 2": "b" + }}, + "row 2": {{ + "col 1": "c", + "col 2": "d" + }} + }} + + Encoding/decoding a Dataframe using ``'columns'`` formatted JSON: + + >>> result = df.to_json(orient="columns") + >>> parsed = loads(result) + >>> dumps(parsed, indent=4) # doctest: +SKIP + {{ + "col 1": {{ + "row 1": "a", + "row 2": "c" + }}, + "col 2": {{ + "row 1": "b", + "row 2": "d" + }} + }} + + Encoding/decoding a Dataframe using ``'values'`` formatted JSON: + + >>> result = df.to_json(orient="values") + >>> parsed = loads(result) + >>> dumps(parsed, indent=4) # doctest: +SKIP + [ + [ + "a", + "b" + ], + [ + "c", + "d" + ] + ] + + Encoding with Table Schema: + + >>> result = df.to_json(orient="table") + >>> parsed = loads(result) + >>> dumps(parsed, indent=4) # doctest: +SKIP + {{ + "schema": {{ + "fields": [ + {{ + "name": "index", + "type": "string" + }}, + {{ + "name": "col 1", + "type": "string" + }}, + {{ + "name": "col 2", + "type": "string" + }} + ], + "primaryKey": [ + "index" + ], + "pandas_version": "1.4.0" + }}, + "data": [ + {{ + "index": "row 1", + "col 1": "a", + "col 2": "b" + }}, + {{ + "index": "row 2", + "col 1": "c", + "col 2": "d" + }} + ] + }} + """ + from pandas.io import json + + if date_format is None and orient == "table": + date_format = "iso" + elif date_format is None: + date_format = "epoch" + + config.is_nonnegative_int(indent) + indent = indent or 0 + + return json.to_json( + path_or_buf=path_or_buf, + obj=self, + orient=orient, + date_format=date_format, + double_precision=double_precision, + force_ascii=force_ascii, + date_unit=date_unit, + default_handler=default_handler, + lines=lines, + compression=compression, + index=index, + indent=indent, + storage_options=storage_options, + mode=mode, + ) + + @final + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "path_or_buf"], name="to_hdf" + ) + def to_hdf( + self, + path_or_buf: FilePath | HDFStore, + key: str, + mode: Literal["a", "w", "r+"] = "a", + complevel: int | None = None, + complib: Literal["zlib", "lzo", "bzip2", "blosc"] | None = None, + append: bool_t = False, + format: Literal["fixed", "table"] | None = None, + index: bool_t = True, + min_itemsize: int | dict[str, int] | None = None, + nan_rep=None, + dropna: bool_t | None = None, + data_columns: Literal[True] | list[str] | None = None, + errors: OpenFileErrors = "strict", + encoding: str = "UTF-8", + ) -> None: + """ + Write the contained data to an HDF5 file using HDFStore. + + Hierarchical Data Format (HDF) is self-describing, allowing an + application to interpret the structure and contents of a file with + no outside information. One HDF file can hold a mix of related objects + which can be accessed as a group or as individual objects. + + In order to add another DataFrame or Series to an existing HDF file + please use append mode and a different a key. + + .. warning:: + + One can store a subclass of ``DataFrame`` or ``Series`` to HDF5, + but the type of the subclass is lost upon storing. + + For more information see the :ref:`user guide `. + + Parameters + ---------- + path_or_buf : str or pandas.HDFStore + File path or HDFStore object. + key : str + Identifier for the group in the store. + mode : {'a', 'w', 'r+'}, default 'a' + Mode to open file: + + - 'w': write, a new file is created (an existing file with + the same name would be deleted). + - 'a': append, an existing file is opened for reading and + writing, and if the file does not exist it is created. + - 'r+': similar to 'a', but the file must already exist. + complevel : {0-9}, default None + Specifies a compression level for data. + A value of 0 or None disables compression. + complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib' + Specifies the compression library to be used. + These additional compressors for Blosc are supported + (default if no compressor specified: 'blosc:blosclz'): + {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', + 'blosc:zlib', 'blosc:zstd'}. + Specifying a compression library which is not available issues + a ValueError. + append : bool, default False + For Table formats, append the input data to the existing. + format : {'fixed', 'table', None}, default 'fixed' + Possible values: + + - 'fixed': Fixed format. Fast writing/reading. Not-appendable, + nor searchable. + - 'table': Table format. Write as a PyTables Table structure + which may perform worse but allow more flexible operations + like searching / selecting subsets of the data. + - If None, pd.get_option('io.hdf.default_format') is checked, + followed by fallback to "fixed". + index : bool, default True + Write DataFrame index as a column. + min_itemsize : dict or int, optional + Map column names to minimum string sizes for columns. + nan_rep : Any, optional + How to represent null values as str. + Not allowed with append=True. + dropna : bool, default False, optional + Remove missing values. + data_columns : list of columns or True, optional + List of columns to create as indexed data columns for on-disk + queries, or True to use all columns. By default only the axes + of the object are indexed. See + :ref:`Query via data columns`. for + more information. + Applicable only to format='table'. + errors : str, default 'strict' + Specifies how encoding and decoding errors are to be handled. + See the errors argument for :func:`open` for a full list + of options. + encoding : str, default "UTF-8" + + See Also + -------- + read_hdf : Read from HDF file. + DataFrame.to_orc : Write a DataFrame to the binary orc format. + DataFrame.to_parquet : Write a DataFrame to the binary parquet format. + DataFrame.to_sql : Write to a SQL table. + DataFrame.to_feather : Write out feather-format for DataFrames. + DataFrame.to_csv : Write out to a csv file. + + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, + ... index=['a', 'b', 'c']) # doctest: +SKIP + >>> df.to_hdf('data.h5', key='df', mode='w') # doctest: +SKIP + + We can add another object to the same file: + + >>> s = pd.Series([1, 2, 3, 4]) # doctest: +SKIP + >>> s.to_hdf('data.h5', key='s') # doctest: +SKIP + + Reading from HDF file: + + >>> pd.read_hdf('data.h5', 'df') # doctest: +SKIP + A B + a 1 4 + b 2 5 + c 3 6 + >>> pd.read_hdf('data.h5', 's') # doctest: +SKIP + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + """ + from pandas.io import pytables + + # Argument 3 to "to_hdf" has incompatible type "NDFrame"; expected + # "Union[DataFrame, Series]" [arg-type] + pytables.to_hdf( + path_or_buf, + key, + self, # type: ignore[arg-type] + mode=mode, + complevel=complevel, + complib=complib, + append=append, + format=format, + index=index, + min_itemsize=min_itemsize, + nan_rep=nan_rep, + dropna=dropna, + data_columns=data_columns, + errors=errors, + encoding=encoding, + ) + + @final + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "name", "con"], name="to_sql" + ) + def to_sql( + self, + name: str, + con, + schema: str | None = None, + if_exists: Literal["fail", "replace", "append"] = "fail", + index: bool_t = True, + index_label: IndexLabel | None = None, + chunksize: int | None = None, + dtype: DtypeArg | None = None, + method: Literal["multi"] | Callable | None = None, + ) -> int | None: + """ + Write records stored in a DataFrame to a SQL database. + + Databases supported by SQLAlchemy [1]_ are supported. Tables can be + newly created, appended to, or overwritten. + + Parameters + ---------- + name : str + Name of SQL table. + con : sqlalchemy.engine.(Engine or Connection) or sqlite3.Connection + Using SQLAlchemy makes it possible to use any DB supported by that + library. Legacy support is provided for sqlite3.Connection objects. The user + is responsible for engine disposal and connection closure for the SQLAlchemy + connectable. See `here \ + `_. + If passing a sqlalchemy.engine.Connection which is already in a transaction, + the transaction will not be committed. If passing a sqlite3.Connection, + it will not be possible to roll back the record insertion. + + schema : str, optional + Specify the schema (if database flavor supports this). If None, use + default schema. + if_exists : {'fail', 'replace', 'append'}, default 'fail' + How to behave if the table already exists. + + * fail: Raise a ValueError. + * replace: Drop the table before inserting new values. + * append: Insert new values to the existing table. + + index : bool, default True + Write DataFrame index as a column. Uses `index_label` as the column + name in the table. Creates a table index for this column. + index_label : str or sequence, default None + Column label for index column(s). If None is given (default) and + `index` is True, then the index names are used. + A sequence should be given if the DataFrame uses MultiIndex. + chunksize : int, optional + Specify the number of rows in each batch to be written at a time. + By default, all rows will be written at once. + dtype : dict or scalar, optional + Specifying the datatype for columns. If a dictionary is used, the + keys should be the column names and the values should be the + SQLAlchemy types or strings for the sqlite3 legacy mode. If a + scalar is provided, it will be applied to all columns. + method : {None, 'multi', callable}, optional + Controls the SQL insertion clause used: + + * None : Uses standard SQL ``INSERT`` clause (one per row). + * 'multi': Pass multiple values in a single ``INSERT`` clause. + * callable with signature ``(pd_table, conn, keys, data_iter)``. + + Details and a sample callable implementation can be found in the + section :ref:`insert method `. + + Returns + ------- + None or int + Number of rows affected by to_sql. None is returned if the callable + passed into ``method`` does not return an integer number of rows. + + The number of returned rows affected is the sum of the ``rowcount`` + attribute of ``sqlite3.Cursor`` or SQLAlchemy connectable which may not + reflect the exact number of written rows as stipulated in the + `sqlite3 `__ or + `SQLAlchemy `__. + + .. versionadded:: 1.4.0 + + Raises + ------ + ValueError + When the table already exists and `if_exists` is 'fail' (the + default). + + See Also + -------- + read_sql : Read a DataFrame from a table. + + Notes + ----- + Timezone aware datetime columns will be written as + ``Timestamp with timezone`` type with SQLAlchemy if supported by the + database. Otherwise, the datetimes will be stored as timezone unaware + timestamps local to the original timezone. + + Not all datastores support ``method="multi"``. Oracle, for example, + does not support multi-value insert. + + References + ---------- + .. [1] https://docs.sqlalchemy.org + .. [2] https://www.python.org/dev/peps/pep-0249/ + + Examples + -------- + Create an in-memory SQLite database. + + >>> from sqlalchemy import create_engine + >>> engine = create_engine('sqlite://', echo=False) + + Create a table from scratch with 3 rows. + + >>> df = pd.DataFrame({'name' : ['User 1', 'User 2', 'User 3']}) + >>> df + name + 0 User 1 + 1 User 2 + 2 User 3 + + >>> df.to_sql(name='users', con=engine) + 3 + >>> from sqlalchemy import text + >>> with engine.connect() as conn: + ... conn.execute(text("SELECT * FROM users")).fetchall() + [(0, 'User 1'), (1, 'User 2'), (2, 'User 3')] + + An `sqlalchemy.engine.Connection` can also be passed to `con`: + + >>> with engine.begin() as connection: + ... df1 = pd.DataFrame({'name' : ['User 4', 'User 5']}) + ... df1.to_sql(name='users', con=connection, if_exists='append') + 2 + + This is allowed to support operations that require that the same + DBAPI connection is used for the entire operation. + + >>> df2 = pd.DataFrame({'name' : ['User 6', 'User 7']}) + >>> df2.to_sql(name='users', con=engine, if_exists='append') + 2 + >>> with engine.connect() as conn: + ... conn.execute(text("SELECT * FROM users")).fetchall() + [(0, 'User 1'), (1, 'User 2'), (2, 'User 3'), + (0, 'User 4'), (1, 'User 5'), (0, 'User 6'), + (1, 'User 7')] + + Overwrite the table with just ``df2``. + + >>> df2.to_sql(name='users', con=engine, if_exists='replace', + ... index_label='id') + 2 + >>> with engine.connect() as conn: + ... conn.execute(text("SELECT * FROM users")).fetchall() + [(0, 'User 6'), (1, 'User 7')] + + Use ``method`` to define a callable insertion method to do nothing + if there's a primary key conflict on a table in a PostgreSQL database. + + >>> from sqlalchemy.dialects.postgresql import insert + >>> def insert_on_conflict_nothing(table, conn, keys, data_iter): + ... # "a" is the primary key in "conflict_table" + ... data = [dict(zip(keys, row)) for row in data_iter] + ... stmt = insert(table.table).values(data).on_conflict_do_nothing(index_elements=["a"]) + ... result = conn.execute(stmt) + ... return result.rowcount + >>> df_conflict.to_sql(name="conflict_table", con=conn, if_exists="append", method=insert_on_conflict_nothing) # doctest: +SKIP + 0 + + For MySQL, a callable to update columns ``b`` and ``c`` if there's a conflict + on a primary key. + + >>> from sqlalchemy.dialects.mysql import insert + >>> def insert_on_conflict_update(table, conn, keys, data_iter): + ... # update columns "b" and "c" on primary key conflict + ... data = [dict(zip(keys, row)) for row in data_iter] + ... stmt = ( + ... insert(table.table) + ... .values(data) + ... ) + ... stmt = stmt.on_duplicate_key_update(b=stmt.inserted.b, c=stmt.inserted.c) + ... result = conn.execute(stmt) + ... return result.rowcount + >>> df_conflict.to_sql(name="conflict_table", con=conn, if_exists="append", method=insert_on_conflict_update) # doctest: +SKIP + 2 + + Specify the dtype (especially useful for integers with missing values). + Notice that while pandas is forced to store the data as floating point, + the database supports nullable integers. When fetching the data with + Python, we get back integer scalars. + + >>> df = pd.DataFrame({"A": [1, None, 2]}) + >>> df + A + 0 1.0 + 1 NaN + 2 2.0 + + >>> from sqlalchemy.types import Integer + >>> df.to_sql(name='integers', con=engine, index=False, + ... dtype={"A": Integer()}) + 3 + + >>> with engine.connect() as conn: + ... conn.execute(text("SELECT * FROM integers")).fetchall() + [(1,), (None,), (2,)] + """ # noqa: E501 + from pandas.io import sql + + return sql.to_sql( + self, + name, + con, + schema=schema, + if_exists=if_exists, + index=index, + index_label=index_label, + chunksize=chunksize, + dtype=dtype, + method=method, + ) + + @final + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "path"], name="to_pickle" + ) + @doc( + storage_options=_shared_docs["storage_options"], + compression_options=_shared_docs["compression_options"] % "path", + ) + def to_pickle( + self, + path: FilePath | WriteBuffer[bytes], + compression: CompressionOptions = "infer", + protocol: int = pickle.HIGHEST_PROTOCOL, + storage_options: StorageOptions | None = None, + ) -> None: + """ + Pickle (serialize) object to file. + + Parameters + ---------- + path : str, path object, or file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``write()`` function. File path where + the pickled object will be stored. + {compression_options} + protocol : int + Int which indicates which protocol should be used by the pickler, + default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible + values are 0, 1, 2, 3, 4, 5. A negative value for the protocol + parameter is equivalent to setting its value to HIGHEST_PROTOCOL. + + .. [1] https://docs.python.org/3/library/pickle.html. + + {storage_options} + + See Also + -------- + read_pickle : Load pickled pandas object (or any object) from file. + DataFrame.to_hdf : Write DataFrame to an HDF5 file. + DataFrame.to_sql : Write DataFrame to a SQL database. + DataFrame.to_parquet : Write a DataFrame to the binary parquet format. + + Examples + -------- + >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) # doctest: +SKIP + >>> original_df # doctest: +SKIP + foo bar + 0 0 5 + 1 1 6 + 2 2 7 + 3 3 8 + 4 4 9 + >>> original_df.to_pickle("./dummy.pkl") # doctest: +SKIP + + >>> unpickled_df = pd.read_pickle("./dummy.pkl") # doctest: +SKIP + >>> unpickled_df # doctest: +SKIP + foo bar + 0 0 5 + 1 1 6 + 2 2 7 + 3 3 8 + 4 4 9 + """ # noqa: E501 + from pandas.io.pickle import to_pickle + + to_pickle( + self, + path, + compression=compression, + protocol=protocol, + storage_options=storage_options, + ) + + @final + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self"], name="to_clipboard" + ) + def to_clipboard( + self, excel: bool_t = True, sep: str | None = None, **kwargs + ) -> None: + r""" + Copy object to the system clipboard. + + Write a text representation of object to the system clipboard. + This can be pasted into Excel, for example. + + Parameters + ---------- + excel : bool, default True + Produce output in a csv format for easy pasting into excel. + + - True, use the provided separator for csv pasting. + - False, write a string representation of the object to the clipboard. + + sep : str, default ``'\t'`` + Field delimiter. + **kwargs + These parameters will be passed to DataFrame.to_csv. + + See Also + -------- + DataFrame.to_csv : Write a DataFrame to a comma-separated values + (csv) file. + read_clipboard : Read text from clipboard and pass to read_csv. + + Notes + ----- + Requirements for your platform. + + - Linux : `xclip`, or `xsel` (with `PyQt4` modules) + - Windows : none + - macOS : none + + This method uses the processes developed for the package `pyperclip`. A + solution to render any output string format is given in the examples. + + Examples + -------- + Copy the contents of a DataFrame to the clipboard. + + >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C']) + + >>> df.to_clipboard(sep=',') # doctest: +SKIP + ... # Wrote the following to the system clipboard: + ... # ,A,B,C + ... # 0,1,2,3 + ... # 1,4,5,6 + + We can omit the index by passing the keyword `index` and setting + it to false. + + >>> df.to_clipboard(sep=',', index=False) # doctest: +SKIP + ... # Wrote the following to the system clipboard: + ... # A,B,C + ... # 1,2,3 + ... # 4,5,6 + + Using the original `pyperclip` package for any string output format. + + .. code-block:: python + + import pyperclip + html = df.style.to_html() + pyperclip.copy(html) + """ + from pandas.io import clipboards + + clipboards.to_clipboard(self, excel=excel, sep=sep, **kwargs) + + @final + def to_xarray(self): + """ + Return an xarray object from the pandas object. + + Returns + ------- + xarray.DataArray or xarray.Dataset + Data in the pandas structure converted to Dataset if the object is + a DataFrame, or a DataArray if the object is a Series. + + See Also + -------- + DataFrame.to_hdf : Write DataFrame to an HDF5 file. + DataFrame.to_parquet : Write a DataFrame to the binary parquet format. + + Notes + ----- + See the `xarray docs `__ + + Examples + -------- + >>> df = pd.DataFrame([('falcon', 'bird', 389.0, 2), + ... ('parrot', 'bird', 24.0, 2), + ... ('lion', 'mammal', 80.5, 4), + ... ('monkey', 'mammal', np.nan, 4)], + ... columns=['name', 'class', 'max_speed', + ... 'num_legs']) + >>> df + name class max_speed num_legs + 0 falcon bird 389.0 2 + 1 parrot bird 24.0 2 + 2 lion mammal 80.5 4 + 3 monkey mammal NaN 4 + + >>> df.to_xarray() # doctest: +SKIP + + Dimensions: (index: 4) + Coordinates: + * index (index) int64 32B 0 1 2 3 + Data variables: + name (index) object 32B 'falcon' 'parrot' 'lion' 'monkey' + class (index) object 32B 'bird' 'bird' 'mammal' 'mammal' + max_speed (index) float64 32B 389.0 24.0 80.5 nan + num_legs (index) int64 32B 2 2 4 4 + + >>> df['max_speed'].to_xarray() # doctest: +SKIP + + array([389. , 24. , 80.5, nan]) + Coordinates: + * index (index) int64 0 1 2 3 + + >>> dates = pd.to_datetime(['2018-01-01', '2018-01-01', + ... '2018-01-02', '2018-01-02']) + >>> df_multiindex = pd.DataFrame({'date': dates, + ... 'animal': ['falcon', 'parrot', + ... 'falcon', 'parrot'], + ... 'speed': [350, 18, 361, 15]}) + >>> df_multiindex = df_multiindex.set_index(['date', 'animal']) + + >>> df_multiindex + speed + date animal + 2018-01-01 falcon 350 + parrot 18 + 2018-01-02 falcon 361 + parrot 15 + + >>> df_multiindex.to_xarray() # doctest: +SKIP + + Dimensions: (date: 2, animal: 2) + Coordinates: + * date (date) datetime64[ns] 2018-01-01 2018-01-02 + * animal (animal) object 'falcon' 'parrot' + Data variables: + speed (date, animal) int64 350 18 361 15 + """ + xarray = import_optional_dependency("xarray") + + if self.ndim == 1: + return xarray.DataArray.from_series(self) + else: + return xarray.Dataset.from_dataframe(self) + + @overload + def to_latex( + self, + buf: None = ..., + columns: Sequence[Hashable] | None = ..., + header: bool_t | SequenceNotStr[str] = ..., + index: bool_t = ..., + na_rep: str = ..., + formatters: FormattersType | None = ..., + float_format: FloatFormatType | None = ..., + sparsify: bool_t | None = ..., + index_names: bool_t = ..., + bold_rows: bool_t = ..., + column_format: str | None = ..., + longtable: bool_t | None = ..., + escape: bool_t | None = ..., + encoding: str | None = ..., + decimal: str = ..., + multicolumn: bool_t | None = ..., + multicolumn_format: str | None = ..., + multirow: bool_t | None = ..., + caption: str | tuple[str, str] | None = ..., + label: str | None = ..., + position: str | None = ..., + ) -> str: + ... + + @overload + def to_latex( + self, + buf: FilePath | WriteBuffer[str], + columns: Sequence[Hashable] | None = ..., + header: bool_t | SequenceNotStr[str] = ..., + index: bool_t = ..., + na_rep: str = ..., + formatters: FormattersType | None = ..., + float_format: FloatFormatType | None = ..., + sparsify: bool_t | None = ..., + index_names: bool_t = ..., + bold_rows: bool_t = ..., + column_format: str | None = ..., + longtable: bool_t | None = ..., + escape: bool_t | None = ..., + encoding: str | None = ..., + decimal: str = ..., + multicolumn: bool_t | None = ..., + multicolumn_format: str | None = ..., + multirow: bool_t | None = ..., + caption: str | tuple[str, str] | None = ..., + label: str | None = ..., + position: str | None = ..., + ) -> None: + ... + + @final + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "buf"], name="to_latex" + ) + def to_latex( + self, + buf: FilePath | WriteBuffer[str] | None = None, + columns: Sequence[Hashable] | None = None, + header: bool_t | SequenceNotStr[str] = True, + index: bool_t = True, + na_rep: str = "NaN", + formatters: FormattersType | None = None, + float_format: FloatFormatType | None = None, + sparsify: bool_t | None = None, + index_names: bool_t = True, + bold_rows: bool_t = False, + column_format: str | None = None, + longtable: bool_t | None = None, + escape: bool_t | None = None, + encoding: str | None = None, + decimal: str = ".", + multicolumn: bool_t | None = None, + multicolumn_format: str | None = None, + multirow: bool_t | None = None, + caption: str | tuple[str, str] | None = None, + label: str | None = None, + position: str | None = None, + ) -> str | None: + r""" + Render object to a LaTeX tabular, longtable, or nested table. + + Requires ``\usepackage{{booktabs}}``. The output can be copy/pasted + into a main LaTeX document or read from an external file + with ``\input{{table.tex}}``. + + .. versionchanged:: 2.0.0 + Refactored to use the Styler implementation via jinja2 templating. + + Parameters + ---------- + buf : str, Path or StringIO-like, optional, default None + Buffer to write to. If None, the output is returned as a string. + columns : list of label, optional + The subset of columns to write. Writes all columns by default. + header : bool or list of str, default True + Write out the column names. If a list of strings is given, + it is assumed to be aliases for the column names. + index : bool, default True + Write row names (index). + na_rep : str, default 'NaN' + Missing data representation. + formatters : list of functions or dict of {{str: function}}, optional + Formatter functions to apply to columns' elements by position or + name. The result of each function must be a unicode string. + List must be of length equal to the number of columns. + float_format : one-parameter function or str, optional, default None + Formatter for floating point numbers. For example + ``float_format="%.2f"`` and ``float_format="{{:0.2f}}".format`` will + both result in 0.1234 being formatted as 0.12. + sparsify : bool, optional + Set to False for a DataFrame with a hierarchical index to print + every multiindex key at each row. By default, the value will be + read from the config module. + index_names : bool, default True + Prints the names of the indexes. + bold_rows : bool, default False + Make the row labels bold in the output. + column_format : str, optional + The columns format as specified in `LaTeX table format + `__ e.g. 'rcl' for 3 + columns. By default, 'l' will be used for all columns except + columns of numbers, which default to 'r'. + longtable : bool, optional + Use a longtable environment instead of tabular. Requires + adding a \usepackage{{longtable}} to your LaTeX preamble. + By default, the value will be read from the pandas config + module, and set to `True` if the option ``styler.latex.environment`` is + `"longtable"`. + + .. versionchanged:: 2.0.0 + The pandas option affecting this argument has changed. + escape : bool, optional + By default, the value will be read from the pandas config + module and set to `True` if the option ``styler.format.escape`` is + `"latex"`. When set to False prevents from escaping latex special + characters in column names. + + .. versionchanged:: 2.0.0 + The pandas option affecting this argument has changed, as has the + default value to `False`. + encoding : str, optional + A string representing the encoding to use in the output file, + defaults to 'utf-8'. + decimal : str, default '.' + Character recognized as decimal separator, e.g. ',' in Europe. + multicolumn : bool, default True + Use \multicolumn to enhance MultiIndex columns. + The default will be read from the config module, and is set + as the option ``styler.sparse.columns``. + + .. versionchanged:: 2.0.0 + The pandas option affecting this argument has changed. + multicolumn_format : str, default 'r' + The alignment for multicolumns, similar to `column_format` + The default will be read from the config module, and is set as the option + ``styler.latex.multicol_align``. + + .. versionchanged:: 2.0.0 + The pandas option affecting this argument has changed, as has the + default value to "r". + multirow : bool, default True + Use \multirow to enhance MultiIndex rows. Requires adding a + \usepackage{{multirow}} to your LaTeX preamble. Will print + centered labels (instead of top-aligned) across the contained + rows, separating groups via clines. The default will be read + from the pandas config module, and is set as the option + ``styler.sparse.index``. + + .. versionchanged:: 2.0.0 + The pandas option affecting this argument has changed, as has the + default value to `True`. + caption : str or tuple, optional + Tuple (full_caption, short_caption), + which results in ``\caption[short_caption]{{full_caption}}``; + if a single string is passed, no short caption will be set. + label : str, optional + The LaTeX label to be placed inside ``\label{{}}`` in the output. + This is used with ``\ref{{}}`` in the main ``.tex`` file. + + position : str, optional + The LaTeX positional argument for tables, to be placed after + ``\begin{{}}`` in the output. + + Returns + ------- + str or None + If buf is None, returns the result as a string. Otherwise returns None. + + See Also + -------- + io.formats.style.Styler.to_latex : Render a DataFrame to LaTeX + with conditional formatting. + DataFrame.to_string : Render a DataFrame to a console-friendly + tabular output. + DataFrame.to_html : Render a DataFrame as an HTML table. + + Notes + ----- + As of v2.0.0 this method has changed to use the Styler implementation as + part of :meth:`.Styler.to_latex` via ``jinja2`` templating. This means + that ``jinja2`` is a requirement, and needs to be installed, for this method + to function. It is advised that users switch to using Styler, since that + implementation is more frequently updated and contains much more + flexibility with the output. + + Examples + -------- + Convert a general DataFrame to LaTeX with formatting: + + >>> df = pd.DataFrame(dict(name=['Raphael', 'Donatello'], + ... age=[26, 45], + ... height=[181.23, 177.65])) + >>> print(df.to_latex(index=False, + ... formatters={"name": str.upper}, + ... float_format="{:.1f}".format, + ... )) # doctest: +SKIP + \begin{tabular}{lrr} + \toprule + name & age & height \\ + \midrule + RAPHAEL & 26 & 181.2 \\ + DONATELLO & 45 & 177.7 \\ + \bottomrule + \end{tabular} + """ + # Get defaults from the pandas config + if self.ndim == 1: + self = self.to_frame() + if longtable is None: + longtable = config.get_option("styler.latex.environment") == "longtable" + if escape is None: + escape = config.get_option("styler.format.escape") == "latex" + if multicolumn is None: + multicolumn = config.get_option("styler.sparse.columns") + if multicolumn_format is None: + multicolumn_format = config.get_option("styler.latex.multicol_align") + if multirow is None: + multirow = config.get_option("styler.sparse.index") + + if column_format is not None and not isinstance(column_format, str): + raise ValueError("`column_format` must be str or unicode") + length = len(self.columns) if columns is None else len(columns) + if isinstance(header, (list, tuple)) and len(header) != length: + raise ValueError(f"Writing {length} cols but got {len(header)} aliases") + + # Refactor formatters/float_format/decimal/na_rep/escape to Styler structure + base_format_ = { + "na_rep": na_rep, + "escape": "latex" if escape else None, + "decimal": decimal, + } + index_format_: dict[str, Any] = {"axis": 0, **base_format_} + column_format_: dict[str, Any] = {"axis": 1, **base_format_} + + if isinstance(float_format, str): + float_format_: Callable | None = lambda x: float_format % x + else: + float_format_ = float_format + + def _wrap(x, alt_format_): + if isinstance(x, (float, complex)) and float_format_ is not None: + return float_format_(x) + else: + return alt_format_(x) + + formatters_: list | tuple | dict | Callable | None = None + if isinstance(formatters, list): + formatters_ = { + c: partial(_wrap, alt_format_=formatters[i]) + for i, c in enumerate(self.columns) + } + elif isinstance(formatters, dict): + index_formatter = formatters.pop("__index__", None) + column_formatter = formatters.pop("__columns__", None) + if index_formatter is not None: + index_format_.update({"formatter": index_formatter}) + if column_formatter is not None: + column_format_.update({"formatter": column_formatter}) + + formatters_ = formatters + float_columns = self.select_dtypes(include="float").columns + for col in float_columns: + if col not in formatters.keys(): + formatters_.update({col: float_format_}) + elif formatters is None and float_format is not None: + formatters_ = partial(_wrap, alt_format_=lambda v: v) + format_index_ = [index_format_, column_format_] + + # Deal with hiding indexes and relabelling column names + hide_: list[dict] = [] + relabel_index_: list[dict] = [] + if columns: + hide_.append( + { + "subset": [c for c in self.columns if c not in columns], + "axis": "columns", + } + ) + if header is False: + hide_.append({"axis": "columns"}) + elif isinstance(header, (list, tuple)): + relabel_index_.append({"labels": header, "axis": "columns"}) + format_index_ = [index_format_] # column_format is overwritten + + if index is False: + hide_.append({"axis": "index"}) + if index_names is False: + hide_.append({"names": True, "axis": "index"}) + + render_kwargs_ = { + "hrules": True, + "sparse_index": sparsify, + "sparse_columns": sparsify, + "environment": "longtable" if longtable else None, + "multicol_align": multicolumn_format + if multicolumn + else f"naive-{multicolumn_format}", + "multirow_align": "t" if multirow else "naive", + "encoding": encoding, + "caption": caption, + "label": label, + "position": position, + "column_format": column_format, + "clines": "skip-last;data" + if (multirow and isinstance(self.index, MultiIndex)) + else None, + "bold_rows": bold_rows, + } + + return self._to_latex_via_styler( + buf, + hide=hide_, + relabel_index=relabel_index_, + format={"formatter": formatters_, **base_format_}, + format_index=format_index_, + render_kwargs=render_kwargs_, + ) + + @final + def _to_latex_via_styler( + self, + buf=None, + *, + hide: dict | list[dict] | None = None, + relabel_index: dict | list[dict] | None = None, + format: dict | list[dict] | None = None, + format_index: dict | list[dict] | None = None, + render_kwargs: dict | None = None, + ): + """ + Render object to a LaTeX tabular, longtable, or nested table. + + Uses the ``Styler`` implementation with the following, ordered, method chaining: + + .. code-block:: python + styler = Styler(DataFrame) + styler.hide(**hide) + styler.relabel_index(**relabel_index) + styler.format(**format) + styler.format_index(**format_index) + styler.to_latex(buf=buf, **render_kwargs) + + Parameters + ---------- + buf : str, Path or StringIO-like, optional, default None + Buffer to write to. If None, the output is returned as a string. + hide : dict, list of dict + Keyword args to pass to the method call of ``Styler.hide``. If a list will + call the method numerous times. + relabel_index : dict, list of dict + Keyword args to pass to the method of ``Styler.relabel_index``. If a list + will call the method numerous times. + format : dict, list of dict + Keyword args to pass to the method call of ``Styler.format``. If a list will + call the method numerous times. + format_index : dict, list of dict + Keyword args to pass to the method call of ``Styler.format_index``. If a + list will call the method numerous times. + render_kwargs : dict + Keyword args to pass to the method call of ``Styler.to_latex``. + + Returns + ------- + str or None + If buf is None, returns the result as a string. Otherwise returns None. + """ + from pandas.io.formats.style import Styler + + self = cast("DataFrame", self) + styler = Styler(self, uuid="") + + for kw_name in ["hide", "relabel_index", "format", "format_index"]: + kw = vars()[kw_name] + if isinstance(kw, dict): + getattr(styler, kw_name)(**kw) + elif isinstance(kw, list): + for sub_kw in kw: + getattr(styler, kw_name)(**sub_kw) + + # bold_rows is not a direct kwarg of Styler.to_latex + render_kwargs = {} if render_kwargs is None else render_kwargs + if render_kwargs.pop("bold_rows"): + styler.map_index(lambda v: "textbf:--rwrap;") + + return styler.to_latex(buf=buf, **render_kwargs) + + @overload + def to_csv( + self, + path_or_buf: None = ..., + sep: str = ..., + na_rep: str = ..., + float_format: str | Callable | None = ..., + columns: Sequence[Hashable] | None = ..., + header: bool_t | list[str] = ..., + index: bool_t = ..., + index_label: IndexLabel | None = ..., + mode: str = ..., + encoding: str | None = ..., + compression: CompressionOptions = ..., + quoting: int | None = ..., + quotechar: str = ..., + lineterminator: str | None = ..., + chunksize: int | None = ..., + date_format: str | None = ..., + doublequote: bool_t = ..., + escapechar: str | None = ..., + decimal: str = ..., + errors: OpenFileErrors = ..., + storage_options: StorageOptions = ..., + ) -> str: + ... + + @overload + def to_csv( + self, + path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str], + sep: str = ..., + na_rep: str = ..., + float_format: str | Callable | None = ..., + columns: Sequence[Hashable] | None = ..., + header: bool_t | list[str] = ..., + index: bool_t = ..., + index_label: IndexLabel | None = ..., + mode: str = ..., + encoding: str | None = ..., + compression: CompressionOptions = ..., + quoting: int | None = ..., + quotechar: str = ..., + lineterminator: str | None = ..., + chunksize: int | None = ..., + date_format: str | None = ..., + doublequote: bool_t = ..., + escapechar: str | None = ..., + decimal: str = ..., + errors: OpenFileErrors = ..., + storage_options: StorageOptions = ..., + ) -> None: + ... + + @final + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "path_or_buf"], name="to_csv" + ) + @doc( + storage_options=_shared_docs["storage_options"], + compression_options=_shared_docs["compression_options"] % "path_or_buf", + ) + def to_csv( + self, + path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, + sep: str = ",", + na_rep: str = "", + float_format: str | Callable | None = None, + columns: Sequence[Hashable] | None = None, + header: bool_t | list[str] = True, + index: bool_t = True, + index_label: IndexLabel | None = None, + mode: str = "w", + encoding: str | None = None, + compression: CompressionOptions = "infer", + quoting: int | None = None, + quotechar: str = '"', + lineterminator: str | None = None, + chunksize: int | None = None, + date_format: str | None = None, + doublequote: bool_t = True, + escapechar: str | None = None, + decimal: str = ".", + errors: OpenFileErrors = "strict", + storage_options: StorageOptions | None = None, + ) -> str | None: + r""" + Write object to a comma-separated values (csv) file. + + Parameters + ---------- + path_or_buf : str, path object, file-like object, or None, default None + String, path object (implementing os.PathLike[str]), or file-like + object implementing a write() function. If None, the result is + returned as a string. If a non-binary file object is passed, it should + be opened with `newline=''`, disabling universal newlines. If a binary + file object is passed, `mode` might need to contain a `'b'`. + sep : str, default ',' + String of length 1. Field delimiter for the output file. + na_rep : str, default '' + Missing data representation. + float_format : str, Callable, default None + Format string for floating point numbers. If a Callable is given, it takes + precedence over other numeric formatting parameters, like decimal. + columns : sequence, optional + Columns to write. + header : bool or list of str, default True + Write out the column names. If a list of strings is given it is + assumed to be aliases for the column names. + index : bool, default True + Write row names (index). + index_label : str or sequence, or False, default None + Column label for index column(s) if desired. If None is given, and + `header` and `index` are True, then the index names are used. A + sequence should be given if the object uses MultiIndex. If + False do not print fields for index names. Use index_label=False + for easier importing in R. + mode : {{'w', 'x', 'a'}}, default 'w' + Forwarded to either `open(mode=)` or `fsspec.open(mode=)` to control + the file opening. Typical values include: + + - 'w', truncate the file first. + - 'x', exclusive creation, failing if the file already exists. + - 'a', append to the end of file if it exists. + + encoding : str, optional + A string representing the encoding to use in the output file, + defaults to 'utf-8'. `encoding` is not supported if `path_or_buf` + is a non-binary file object. + {compression_options} + + May be a dict with key 'method' as compression mode + and other entries as additional compression options if + compression mode is 'zip'. + + Passing compression options as keys in dict is + supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'. + quoting : optional constant from csv module + Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` + then floats are converted to strings and thus csv.QUOTE_NONNUMERIC + will treat them as non-numeric. + quotechar : str, default '\"' + String of length 1. Character used to quote fields. + lineterminator : str, optional + The newline character or character sequence to use in the output + file. Defaults to `os.linesep`, which depends on the OS in which + this method is called ('\\n' for linux, '\\r\\n' for Windows, i.e.). + + .. versionchanged:: 1.5.0 + + Previously was line_terminator, changed for consistency with + read_csv and the standard library 'csv' module. + + chunksize : int or None + Rows to write at a time. + date_format : str, default None + Format string for datetime objects. + doublequote : bool, default True + Control quoting of `quotechar` inside a field. + escapechar : str, default None + String of length 1. Character used to escape `sep` and `quotechar` + when appropriate. + decimal : str, default '.' + Character recognized as decimal separator. E.g. use ',' for + European data. + errors : str, default 'strict' + Specifies how encoding and decoding errors are to be handled. + See the errors argument for :func:`open` for a full list + of options. + + {storage_options} + + Returns + ------- + None or str + If path_or_buf is None, returns the resulting csv format as a + string. Otherwise returns None. + + See Also + -------- + read_csv : Load a CSV file into a DataFrame. + to_excel : Write DataFrame to an Excel file. + + Examples + -------- + Create 'out.csv' containing 'df' without indices + + >>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'], + ... 'mask': ['red', 'purple'], + ... 'weapon': ['sai', 'bo staff']}}) + >>> df.to_csv('out.csv', index=False) # doctest: +SKIP + + Create 'out.zip' containing 'out.csv' + + >>> df.to_csv(index=False) + 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n' + >>> compression_opts = dict(method='zip', + ... archive_name='out.csv') # doctest: +SKIP + >>> df.to_csv('out.zip', index=False, + ... compression=compression_opts) # doctest: +SKIP + + To write a csv file to a new folder or nested folder you will first + need to create it using either Pathlib or os: + + >>> from pathlib import Path # doctest: +SKIP + >>> filepath = Path('folder/subfolder/out.csv') # doctest: +SKIP + >>> filepath.parent.mkdir(parents=True, exist_ok=True) # doctest: +SKIP + >>> df.to_csv(filepath) # doctest: +SKIP + + >>> import os # doctest: +SKIP + >>> os.makedirs('folder/subfolder', exist_ok=True) # doctest: +SKIP + >>> df.to_csv('folder/subfolder/out.csv') # doctest: +SKIP + """ + df = self if isinstance(self, ABCDataFrame) else self.to_frame() + + formatter = DataFrameFormatter( + frame=df, + header=header, + index=index, + na_rep=na_rep, + float_format=float_format, + decimal=decimal, + ) + + return DataFrameRenderer(formatter).to_csv( + path_or_buf, + lineterminator=lineterminator, + sep=sep, + encoding=encoding, + errors=errors, + compression=compression, + quoting=quoting, + columns=columns, + index_label=index_label, + mode=mode, + chunksize=chunksize, + quotechar=quotechar, + date_format=date_format, + doublequote=doublequote, + escapechar=escapechar, + storage_options=storage_options, + ) + + # ---------------------------------------------------------------------- + # Lookup Caching + + def _reset_cacher(self) -> None: + """ + Reset the cacher. + """ + raise AbstractMethodError(self) + + def _maybe_update_cacher( + self, + clear: bool_t = False, + verify_is_copy: bool_t = True, + inplace: bool_t = False, + ) -> None: + """ + See if we need to update our parent cacher if clear, then clear our + cache. + + Parameters + ---------- + clear : bool, default False + Clear the item cache. + verify_is_copy : bool, default True + Provide is_copy checks. + """ + if using_copy_on_write(): + return + + if verify_is_copy: + self._check_setitem_copy(t="referent") + + if clear: + self._clear_item_cache() + + def _clear_item_cache(self) -> None: + raise AbstractMethodError(self) + + # ---------------------------------------------------------------------- + # Indexing Methods + + @final + def take(self, indices, axis: Axis = 0, **kwargs) -> Self: + """ + Return the elements in the given *positional* indices along an axis. + + This means that we are not indexing according to actual values in + the index attribute of the object. We are indexing according to the + actual position of the element in the object. + + Parameters + ---------- + indices : array-like + An array of ints indicating which positions to take. + axis : {0 or 'index', 1 or 'columns', None}, default 0 + The axis on which to select elements. ``0`` means that we are + selecting rows, ``1`` means that we are selecting columns. + For `Series` this parameter is unused and defaults to 0. + **kwargs + For compatibility with :meth:`numpy.take`. Has no effect on the + output. + + Returns + ------- + same type as caller + An array-like containing the elements taken from the object. + + See Also + -------- + DataFrame.loc : Select a subset of a DataFrame by labels. + DataFrame.iloc : Select a subset of a DataFrame by positions. + numpy.take : Take elements from an array along an axis. + + Examples + -------- + >>> df = pd.DataFrame([('falcon', 'bird', 389.0), + ... ('parrot', 'bird', 24.0), + ... ('lion', 'mammal', 80.5), + ... ('monkey', 'mammal', np.nan)], + ... columns=['name', 'class', 'max_speed'], + ... index=[0, 2, 3, 1]) + >>> df + name class max_speed + 0 falcon bird 389.0 + 2 parrot bird 24.0 + 3 lion mammal 80.5 + 1 monkey mammal NaN + + Take elements at positions 0 and 3 along the axis 0 (default). + + Note how the actual indices selected (0 and 1) do not correspond to + our selected indices 0 and 3. That's because we are selecting the 0th + and 3rd rows, not rows whose indices equal 0 and 3. + + >>> df.take([0, 3]) + name class max_speed + 0 falcon bird 389.0 + 1 monkey mammal NaN + + Take elements at indices 1 and 2 along the axis 1 (column selection). + + >>> df.take([1, 2], axis=1) + class max_speed + 0 bird 389.0 + 2 bird 24.0 + 3 mammal 80.5 + 1 mammal NaN + + We may take elements using negative integers for positive indices, + starting from the end of the object, just like with Python lists. + + >>> df.take([-1, -2]) + name class max_speed + 1 monkey mammal NaN + 3 lion mammal 80.5 + """ + + nv.validate_take((), kwargs) + + if not isinstance(indices, slice): + indices = np.asarray(indices, dtype=np.intp) + if ( + axis == 0 + and indices.ndim == 1 + and using_copy_on_write() + and is_range_indexer(indices, len(self)) + ): + return self.copy(deep=None) + elif self.ndim == 1: + raise TypeError( + f"{type(self).__name__}.take requires a sequence of integers, " + "not slice." + ) + else: + warnings.warn( + # GH#51539 + f"Passing a slice to {type(self).__name__}.take is deprecated " + "and will raise in a future version. Use `obj[slicer]` or pass " + "a sequence of integers instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + # We can get here with a slice via DataFrame.__getitem__ + indices = np.arange( + indices.start, indices.stop, indices.step, dtype=np.intp + ) + + new_data = self._mgr.take( + indices, + axis=self._get_block_manager_axis(axis), + verify=True, + ) + return self._constructor_from_mgr(new_data, axes=new_data.axes).__finalize__( + self, method="take" + ) + + @final + def _take_with_is_copy(self, indices, axis: Axis = 0) -> Self: + """ + Internal version of the `take` method that sets the `_is_copy` + attribute to keep track of the parent dataframe (using in indexing + for the SettingWithCopyWarning). + + For Series this does the same as the public take (it never sets `_is_copy`). + + See the docstring of `take` for full explanation of the parameters. + """ + result = self.take(indices=indices, axis=axis) + # Maybe set copy if we didn't actually change the index. + if self.ndim == 2 and not result._get_axis(axis).equals(self._get_axis(axis)): + result._set_is_copy(self) + return result + + @final + def xs( + self, + key: IndexLabel, + axis: Axis = 0, + level: IndexLabel | None = None, + drop_level: bool_t = True, + ) -> Self: + """ + Return cross-section from the Series/DataFrame. + + This method takes a `key` argument to select data at a particular + level of a MultiIndex. + + Parameters + ---------- + key : label or tuple of label + Label contained in the index, or partially in a MultiIndex. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Axis to retrieve cross-section on. + level : object, defaults to first n levels (n=1 or len(key)) + In case of a key partially contained in a MultiIndex, indicate + which levels are used. Levels can be referred by label or position. + drop_level : bool, default True + If False, returns object with same levels as self. + + Returns + ------- + Series or DataFrame + Cross-section from the original Series or DataFrame + corresponding to the selected index levels. + + See Also + -------- + DataFrame.loc : Access a group of rows and columns + by label(s) or a boolean array. + DataFrame.iloc : Purely integer-location based indexing + for selection by position. + + Notes + ----- + `xs` can not be used to set values. + + MultiIndex Slicers is a generic way to get/set values on + any level or levels. + It is a superset of `xs` functionality, see + :ref:`MultiIndex Slicers `. + + Examples + -------- + >>> d = {'num_legs': [4, 4, 2, 2], + ... 'num_wings': [0, 0, 2, 2], + ... 'class': ['mammal', 'mammal', 'mammal', 'bird'], + ... 'animal': ['cat', 'dog', 'bat', 'penguin'], + ... 'locomotion': ['walks', 'walks', 'flies', 'walks']} + >>> df = pd.DataFrame(data=d) + >>> df = df.set_index(['class', 'animal', 'locomotion']) + >>> df + num_legs num_wings + class animal locomotion + mammal cat walks 4 0 + dog walks 4 0 + bat flies 2 2 + bird penguin walks 2 2 + + Get values at specified index + + >>> df.xs('mammal') + num_legs num_wings + animal locomotion + cat walks 4 0 + dog walks 4 0 + bat flies 2 2 + + Get values at several indexes + + >>> df.xs(('mammal', 'dog', 'walks')) + num_legs 4 + num_wings 0 + Name: (mammal, dog, walks), dtype: int64 + + Get values at specified index and level + + >>> df.xs('cat', level=1) + num_legs num_wings + class locomotion + mammal walks 4 0 + + Get values at several indexes and levels + + >>> df.xs(('bird', 'walks'), + ... level=[0, 'locomotion']) + num_legs num_wings + animal + penguin 2 2 + + Get values at specified column and axis + + >>> df.xs('num_wings', axis=1) + class animal locomotion + mammal cat walks 0 + dog walks 0 + bat flies 2 + bird penguin walks 2 + Name: num_wings, dtype: int64 + """ + axis = self._get_axis_number(axis) + labels = self._get_axis(axis) + + if isinstance(key, list): + raise TypeError("list keys are not supported in xs, pass a tuple instead") + + if level is not None: + if not isinstance(labels, MultiIndex): + raise TypeError("Index must be a MultiIndex") + loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level) + + # create the tuple of the indexer + _indexer = [slice(None)] * self.ndim + _indexer[axis] = loc + indexer = tuple(_indexer) + + result = self.iloc[indexer] + setattr(result, result._get_axis_name(axis), new_ax) + return result + + if axis == 1: + if drop_level: + return self[key] + index = self.columns + else: + index = self.index + + if isinstance(index, MultiIndex): + loc, new_index = index._get_loc_level(key, level=0) + if not drop_level: + if lib.is_integer(loc): + # Slice index must be an integer or None + new_index = index[loc : loc + 1] + else: + new_index = index[loc] + else: + loc = index.get_loc(key) + + if isinstance(loc, np.ndarray): + if loc.dtype == np.bool_: + (inds,) = loc.nonzero() + return self._take_with_is_copy(inds, axis=axis) + else: + return self._take_with_is_copy(loc, axis=axis) + + if not is_scalar(loc): + new_index = index[loc] + + if is_scalar(loc) and axis == 0: + # In this case loc should be an integer + if self.ndim == 1: + # if we encounter an array-like and we only have 1 dim + # that means that their are list/ndarrays inside the Series! + # so just return them (GH 6394) + return self._values[loc] + + new_mgr = self._mgr.fast_xs(loc) + + result = self._constructor_sliced_from_mgr(new_mgr, axes=new_mgr.axes) + result._name = self.index[loc] + result = result.__finalize__(self) + elif is_scalar(loc): + result = self.iloc[:, slice(loc, loc + 1)] + elif axis == 1: + result = self.iloc[:, loc] + else: + result = self.iloc[loc] + result.index = new_index + + # this could be a view + # but only in a single-dtyped view sliceable case + result._set_is_copy(self, copy=not result._is_view) + return result + + def __getitem__(self, item): + raise AbstractMethodError(self) + + @final + def _getitem_slice(self, key: slice) -> Self: + """ + __getitem__ for the case where the key is a slice object. + """ + # _convert_slice_indexer to determine if this slice is positional + # or label based, and if the latter, convert to positional + slobj = self.index._convert_slice_indexer(key, kind="getitem") + if isinstance(slobj, np.ndarray): + # reachable with DatetimeIndex + indexer = lib.maybe_indices_to_slice( + slobj.astype(np.intp, copy=False), len(self) + ) + if isinstance(indexer, np.ndarray): + # GH#43223 If we can not convert, use take + return self.take(indexer, axis=0) + slobj = indexer + return self._slice(slobj) + + def _slice(self, slobj: slice, axis: AxisInt = 0) -> Self: + """ + Construct a slice of this container. + + Slicing with this method is *always* positional. + """ + assert isinstance(slobj, slice), type(slobj) + axis = self._get_block_manager_axis(axis) + new_mgr = self._mgr.get_slice(slobj, axis=axis) + result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + result = result.__finalize__(self) + + # this could be a view + # but only in a single-dtyped view sliceable case + is_copy = axis != 0 or result._is_view + result._set_is_copy(self, copy=is_copy) + return result + + @final + def _set_is_copy(self, ref: NDFrame, copy: bool_t = True) -> None: + if not copy: + self._is_copy = None + else: + assert ref is not None + self._is_copy = weakref.ref(ref) + + def _check_is_chained_assignment_possible(self) -> bool_t: + """ + Check if we are a view, have a cacher, and are of mixed type. + If so, then force a setitem_copy check. + + Should be called just near setting a value + + Will return a boolean if it we are a view and are cached, but a + single-dtype meaning that the cacher should be updated following + setting. + """ + if self._is_copy: + self._check_setitem_copy(t="referent") + return False + + @final + def _check_setitem_copy(self, t: str = "setting", force: bool_t = False): + """ + + Parameters + ---------- + t : str, the type of setting error + force : bool, default False + If True, then force showing an error. + + validate if we are doing a setitem on a chained copy. + + It is technically possible to figure out that we are setting on + a copy even WITH a multi-dtyped pandas object. In other words, some + blocks may be views while other are not. Currently _is_view will ALWAYS + return False for multi-blocks to avoid having to handle this case. + + df = DataFrame(np.arange(0,9), columns=['count']) + df['group'] = 'b' + + # This technically need not raise SettingWithCopy if both are view + # (which is not generally guaranteed but is usually True. However, + # this is in general not a good practice and we recommend using .loc. + df.iloc[0:5]['group'] = 'a' + + """ + if using_copy_on_write() or warn_copy_on_write(): + return + + # return early if the check is not needed + if not (force or self._is_copy): + return + + value = config.get_option("mode.chained_assignment") + if value is None: + return + + # see if the copy is not actually referred; if so, then dissolve + # the copy weakref + if self._is_copy is not None and not isinstance(self._is_copy, str): + r = self._is_copy() + if not gc.get_referents(r) or (r is not None and r.shape == self.shape): + self._is_copy = None + return + + # a custom message + if isinstance(self._is_copy, str): + t = self._is_copy + + elif t == "referent": + t = ( + "\n" + "A value is trying to be set on a copy of a slice from a " + "DataFrame\n\n" + "See the caveats in the documentation: " + "https://pandas.pydata.org/pandas-docs/stable/user_guide/" + "indexing.html#returning-a-view-versus-a-copy" + ) + + else: + t = ( + "\n" + "A value is trying to be set on a copy of a slice from a " + "DataFrame.\n" + "Try using .loc[row_indexer,col_indexer] = value " + "instead\n\nSee the caveats in the documentation: " + "https://pandas.pydata.org/pandas-docs/stable/user_guide/" + "indexing.html#returning-a-view-versus-a-copy" + ) + + if value == "raise": + raise SettingWithCopyError(t) + if value == "warn": + warnings.warn(t, SettingWithCopyWarning, stacklevel=find_stack_level()) + + @final + def __delitem__(self, key) -> None: + """ + Delete item + """ + deleted = False + + maybe_shortcut = False + if self.ndim == 2 and isinstance(self.columns, MultiIndex): + try: + # By using engine's __contains__ we effectively + # restrict to same-length tuples + maybe_shortcut = key not in self.columns._engine + except TypeError: + pass + + if maybe_shortcut: + # Allow shorthand to delete all columns whose first len(key) + # elements match key: + if not isinstance(key, tuple): + key = (key,) + for col in self.columns: + if isinstance(col, tuple) and col[: len(key)] == key: + del self[col] + deleted = True + if not deleted: + # If the above loop ran and didn't delete anything because + # there was no match, this call should raise the appropriate + # exception: + loc = self.axes[-1].get_loc(key) + self._mgr = self._mgr.idelete(loc) + + # delete from the caches + try: + del self._item_cache[key] + except KeyError: + pass + + # ---------------------------------------------------------------------- + # Unsorted + + @final + def _check_inplace_and_allows_duplicate_labels(self, inplace: bool_t): + if inplace and not self.flags.allows_duplicate_labels: + raise ValueError( + "Cannot specify 'inplace=True' when " + "'self.flags.allows_duplicate_labels' is False." + ) + + @final + def get(self, key, default=None): + """ + Get item from object for given key (ex: DataFrame column). + + Returns default value if not found. + + Parameters + ---------- + key : object + + Returns + ------- + same type as items contained in object + + Examples + -------- + >>> df = pd.DataFrame( + ... [ + ... [24.3, 75.7, "high"], + ... [31, 87.8, "high"], + ... [22, 71.6, "medium"], + ... [35, 95, "medium"], + ... ], + ... columns=["temp_celsius", "temp_fahrenheit", "windspeed"], + ... index=pd.date_range(start="2014-02-12", end="2014-02-15", freq="D"), + ... ) + + >>> df + temp_celsius temp_fahrenheit windspeed + 2014-02-12 24.3 75.7 high + 2014-02-13 31.0 87.8 high + 2014-02-14 22.0 71.6 medium + 2014-02-15 35.0 95.0 medium + + >>> df.get(["temp_celsius", "windspeed"]) + temp_celsius windspeed + 2014-02-12 24.3 high + 2014-02-13 31.0 high + 2014-02-14 22.0 medium + 2014-02-15 35.0 medium + + >>> ser = df['windspeed'] + >>> ser.get('2014-02-13') + 'high' + + If the key isn't found, the default value will be used. + + >>> df.get(["temp_celsius", "temp_kelvin"], default="default_value") + 'default_value' + + >>> ser.get('2014-02-10', '[unknown]') + '[unknown]' + """ + try: + return self[key] + except (KeyError, ValueError, IndexError): + return default + + @final + @property + def _is_view(self) -> bool_t: + """Return boolean indicating if self is view of another array""" + return self._mgr.is_view + + @final + def reindex_like( + self, + other, + method: Literal["backfill", "bfill", "pad", "ffill", "nearest"] | None = None, + copy: bool_t | None = None, + limit: int | None = None, + tolerance=None, + ) -> Self: + """ + Return an object with matching indices as other object. + + Conform the object to the same index on all axes. Optional + filling logic, placing NaN in locations having no value + in the previous index. A new object is produced unless the + new index is equivalent to the current one and copy=False. + + Parameters + ---------- + other : Object of the same data type + Its row and column indices are used to define the new indices + of this object. + method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'} + Method to use for filling holes in reindexed DataFrame. + Please note: this is only applicable to DataFrames/Series with a + monotonically increasing/decreasing index. + + * None (default): don't fill gaps + * pad / ffill: propagate last valid observation forward to next + valid + * backfill / bfill: use next valid observation to fill gap + * nearest: use nearest valid observations to fill gap. + + copy : bool, default True + Return a new object, even if the passed indexes are the same. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + limit : int, default None + Maximum number of consecutive labels to fill for inexact matches. + tolerance : optional + Maximum distance between original and new labels for inexact + matches. The values of the index at the matching locations must + satisfy the equation ``abs(index[indexer] - target) <= tolerance``. + + Tolerance may be a scalar value, which applies the same tolerance + to all values, or list-like, which applies variable tolerance per + element. List-like includes list, tuple, array, Series, and must be + the same size as the index and its dtype must exactly match the + index's type. + + Returns + ------- + Series or DataFrame + Same type as caller, but with changed indices on each axis. + + See Also + -------- + DataFrame.set_index : Set row labels. + DataFrame.reset_index : Remove row labels or move them to new columns. + DataFrame.reindex : Change to new indices or expand indices. + + Notes + ----- + Same as calling + ``.reindex(index=other.index, columns=other.columns,...)``. + + Examples + -------- + >>> df1 = pd.DataFrame([[24.3, 75.7, 'high'], + ... [31, 87.8, 'high'], + ... [22, 71.6, 'medium'], + ... [35, 95, 'medium']], + ... columns=['temp_celsius', 'temp_fahrenheit', + ... 'windspeed'], + ... index=pd.date_range(start='2014-02-12', + ... end='2014-02-15', freq='D')) + + >>> df1 + temp_celsius temp_fahrenheit windspeed + 2014-02-12 24.3 75.7 high + 2014-02-13 31.0 87.8 high + 2014-02-14 22.0 71.6 medium + 2014-02-15 35.0 95.0 medium + + >>> df2 = pd.DataFrame([[28, 'low'], + ... [30, 'low'], + ... [35.1, 'medium']], + ... columns=['temp_celsius', 'windspeed'], + ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13', + ... '2014-02-15'])) + + >>> df2 + temp_celsius windspeed + 2014-02-12 28.0 low + 2014-02-13 30.0 low + 2014-02-15 35.1 medium + + >>> df2.reindex_like(df1) + temp_celsius temp_fahrenheit windspeed + 2014-02-12 28.0 NaN low + 2014-02-13 30.0 NaN low + 2014-02-14 NaN NaN NaN + 2014-02-15 35.1 NaN medium + """ + d = other._construct_axes_dict( + axes=self._AXIS_ORDERS, + method=method, + copy=copy, + limit=limit, + tolerance=tolerance, + ) + + return self.reindex(**d) + + @overload + def drop( + self, + labels: IndexLabel = ..., + *, + axis: Axis = ..., + index: IndexLabel = ..., + columns: IndexLabel = ..., + level: Level | None = ..., + inplace: Literal[True], + errors: IgnoreRaise = ..., + ) -> None: + ... + + @overload + def drop( + self, + labels: IndexLabel = ..., + *, + axis: Axis = ..., + index: IndexLabel = ..., + columns: IndexLabel = ..., + level: Level | None = ..., + inplace: Literal[False] = ..., + errors: IgnoreRaise = ..., + ) -> Self: + ... + + @overload + def drop( + self, + labels: IndexLabel = ..., + *, + axis: Axis = ..., + index: IndexLabel = ..., + columns: IndexLabel = ..., + level: Level | None = ..., + inplace: bool_t = ..., + errors: IgnoreRaise = ..., + ) -> Self | None: + ... + + def drop( + self, + labels: IndexLabel | None = None, + *, + axis: Axis = 0, + index: IndexLabel | None = None, + columns: IndexLabel | None = None, + level: Level | None = None, + inplace: bool_t = False, + errors: IgnoreRaise = "raise", + ) -> Self | None: + inplace = validate_bool_kwarg(inplace, "inplace") + + if labels is not None: + if index is not None or columns is not None: + raise ValueError("Cannot specify both 'labels' and 'index'/'columns'") + axis_name = self._get_axis_name(axis) + axes = {axis_name: labels} + elif index is not None or columns is not None: + axes = {"index": index} + if self.ndim == 2: + axes["columns"] = columns + else: + raise ValueError( + "Need to specify at least one of 'labels', 'index' or 'columns'" + ) + + obj = self + + for axis, labels in axes.items(): + if labels is not None: + obj = obj._drop_axis(labels, axis, level=level, errors=errors) + + if inplace: + self._update_inplace(obj) + return None + else: + return obj + + @final + def _drop_axis( + self, + labels, + axis, + level=None, + errors: IgnoreRaise = "raise", + only_slice: bool_t = False, + ) -> Self: + """ + Drop labels from specified axis. Used in the ``drop`` method + internally. + + Parameters + ---------- + labels : single label or list-like + axis : int or axis name + level : int or level name, default None + For MultiIndex + errors : {'ignore', 'raise'}, default 'raise' + If 'ignore', suppress error and existing labels are dropped. + only_slice : bool, default False + Whether indexing along columns should be view-only. + + """ + axis_num = self._get_axis_number(axis) + axis = self._get_axis(axis) + + if axis.is_unique: + if level is not None: + if not isinstance(axis, MultiIndex): + raise AssertionError("axis must be a MultiIndex") + new_axis = axis.drop(labels, level=level, errors=errors) + else: + new_axis = axis.drop(labels, errors=errors) + indexer = axis.get_indexer(new_axis) + + # Case for non-unique axis + else: + is_tuple_labels = is_nested_list_like(labels) or isinstance(labels, tuple) + labels = ensure_object(common.index_labels_to_array(labels)) + if level is not None: + if not isinstance(axis, MultiIndex): + raise AssertionError("axis must be a MultiIndex") + mask = ~axis.get_level_values(level).isin(labels) + + # GH 18561 MultiIndex.drop should raise if label is absent + if errors == "raise" and mask.all(): + raise KeyError(f"{labels} not found in axis") + elif ( + isinstance(axis, MultiIndex) + and labels.dtype == "object" + and not is_tuple_labels + ): + # Set level to zero in case of MultiIndex and label is string, + # because isin can't handle strings for MultiIndexes GH#36293 + # In case of tuples we get dtype object but have to use isin GH#42771 + mask = ~axis.get_level_values(0).isin(labels) + else: + mask = ~axis.isin(labels) + # Check if label doesn't exist along axis + labels_missing = (axis.get_indexer_for(labels) == -1).any() + if errors == "raise" and labels_missing: + raise KeyError(f"{labels} not found in axis") + + if isinstance(mask.dtype, ExtensionDtype): + # GH#45860 + mask = mask.to_numpy(dtype=bool) + + indexer = mask.nonzero()[0] + new_axis = axis.take(indexer) + + bm_axis = self.ndim - axis_num - 1 + new_mgr = self._mgr.reindex_indexer( + new_axis, + indexer, + axis=bm_axis, + allow_dups=True, + copy=None, + only_slice=only_slice, + ) + result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + if self.ndim == 1: + result._name = self.name + + return result.__finalize__(self) + + @final + def _update_inplace(self, result, verify_is_copy: bool_t = True) -> None: + """ + Replace self internals with result. + + Parameters + ---------- + result : same type as self + verify_is_copy : bool, default True + Provide is_copy checks. + """ + # NOTE: This does *not* call __finalize__ and that's an explicit + # decision that we may revisit in the future. + self._reset_cache() + self._clear_item_cache() + self._mgr = result._mgr + self._maybe_update_cacher(verify_is_copy=verify_is_copy, inplace=True) + + @final + def add_prefix(self, prefix: str, axis: Axis | None = None) -> Self: + """ + Prefix labels with string `prefix`. + + For Series, the row labels are prefixed. + For DataFrame, the column labels are prefixed. + + Parameters + ---------- + prefix : str + The string to add before each label. + axis : {0 or 'index', 1 or 'columns', None}, default None + Axis to add prefix on + + .. versionadded:: 2.0.0 + + Returns + ------- + Series or DataFrame + New Series or DataFrame with updated labels. + + See Also + -------- + Series.add_suffix: Suffix row labels with string `suffix`. + DataFrame.add_suffix: Suffix column labels with string `suffix`. + + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4]) + >>> s + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + + >>> s.add_prefix('item_') + item_0 1 + item_1 2 + item_2 3 + item_3 4 + dtype: int64 + + >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}) + >>> df + A B + 0 1 3 + 1 2 4 + 2 3 5 + 3 4 6 + + >>> df.add_prefix('col_') + col_A col_B + 0 1 3 + 1 2 4 + 2 3 5 + 3 4 6 + """ + f = lambda x: f"{prefix}{x}" + + axis_name = self._info_axis_name + if axis is not None: + axis_name = self._get_axis_name(axis) + + mapper = {axis_name: f} + + # error: Incompatible return value type (got "Optional[Self]", + # expected "Self") + # error: Argument 1 to "rename" of "NDFrame" has incompatible type + # "**Dict[str, partial[str]]"; expected "Union[str, int, None]" + # error: Keywords must be strings + return self._rename(**mapper) # type: ignore[return-value, arg-type, misc] + + @final + def add_suffix(self, suffix: str, axis: Axis | None = None) -> Self: + """ + Suffix labels with string `suffix`. + + For Series, the row labels are suffixed. + For DataFrame, the column labels are suffixed. + + Parameters + ---------- + suffix : str + The string to add after each label. + axis : {0 or 'index', 1 or 'columns', None}, default None + Axis to add suffix on + + .. versionadded:: 2.0.0 + + Returns + ------- + Series or DataFrame + New Series or DataFrame with updated labels. + + See Also + -------- + Series.add_prefix: Prefix row labels with string `prefix`. + DataFrame.add_prefix: Prefix column labels with string `prefix`. + + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4]) + >>> s + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + + >>> s.add_suffix('_item') + 0_item 1 + 1_item 2 + 2_item 3 + 3_item 4 + dtype: int64 + + >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}) + >>> df + A B + 0 1 3 + 1 2 4 + 2 3 5 + 3 4 6 + + >>> df.add_suffix('_col') + A_col B_col + 0 1 3 + 1 2 4 + 2 3 5 + 3 4 6 + """ + f = lambda x: f"{x}{suffix}" + + axis_name = self._info_axis_name + if axis is not None: + axis_name = self._get_axis_name(axis) + + mapper = {axis_name: f} + # error: Incompatible return value type (got "Optional[Self]", + # expected "Self") + # error: Argument 1 to "rename" of "NDFrame" has incompatible type + # "**Dict[str, partial[str]]"; expected "Union[str, int, None]" + # error: Keywords must be strings + return self._rename(**mapper) # type: ignore[return-value, arg-type, misc] + + @overload + def sort_values( + self, + *, + axis: Axis = ..., + ascending: bool_t | Sequence[bool_t] = ..., + inplace: Literal[False] = ..., + kind: SortKind = ..., + na_position: NaPosition = ..., + ignore_index: bool_t = ..., + key: ValueKeyFunc = ..., + ) -> Self: + ... + + @overload + def sort_values( + self, + *, + axis: Axis = ..., + ascending: bool_t | Sequence[bool_t] = ..., + inplace: Literal[True], + kind: SortKind = ..., + na_position: NaPosition = ..., + ignore_index: bool_t = ..., + key: ValueKeyFunc = ..., + ) -> None: + ... + + @overload + def sort_values( + self, + *, + axis: Axis = ..., + ascending: bool_t | Sequence[bool_t] = ..., + inplace: bool_t = ..., + kind: SortKind = ..., + na_position: NaPosition = ..., + ignore_index: bool_t = ..., + key: ValueKeyFunc = ..., + ) -> Self | None: + ... + + def sort_values( + self, + *, + axis: Axis = 0, + ascending: bool_t | Sequence[bool_t] = True, + inplace: bool_t = False, + kind: SortKind = "quicksort", + na_position: NaPosition = "last", + ignore_index: bool_t = False, + key: ValueKeyFunc | None = None, + ) -> Self | None: + """ + Sort by the values along either axis. + + Parameters + ----------%(optional_by)s + axis : %(axes_single_arg)s, default 0 + Axis to be sorted. + ascending : bool or list of bool, default True + Sort ascending vs. descending. Specify list for multiple sort + orders. If this is a list of bools, must match the length of + the by. + inplace : bool, default False + If True, perform operation in-place. + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort' + Choice of sorting algorithm. See also :func:`numpy.sort` for more + information. `mergesort` and `stable` are the only stable algorithms. For + DataFrames, this option is only applied when sorting on a single + column or label. + na_position : {'first', 'last'}, default 'last' + Puts NaNs at the beginning if `first`; `last` puts NaNs at the + end. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + key : callable, optional + Apply the key function to the values + before sorting. This is similar to the `key` argument in the + builtin :meth:`sorted` function, with the notable difference that + this `key` function should be *vectorized*. It should expect a + ``Series`` and return a Series with the same shape as the input. + It will be applied to each column in `by` independently. + + Returns + ------- + DataFrame or None + DataFrame with sorted values or None if ``inplace=True``. + + See Also + -------- + DataFrame.sort_index : Sort a DataFrame by the index. + Series.sort_values : Similar method for a Series. + + Examples + -------- + >>> df = pd.DataFrame({ + ... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'], + ... 'col2': [2, 1, 9, 8, 7, 4], + ... 'col3': [0, 1, 9, 4, 2, 3], + ... 'col4': ['a', 'B', 'c', 'D', 'e', 'F'] + ... }) + >>> df + col1 col2 col3 col4 + 0 A 2 0 a + 1 A 1 1 B + 2 B 9 9 c + 3 NaN 8 4 D + 4 D 7 2 e + 5 C 4 3 F + + Sort by col1 + + >>> df.sort_values(by=['col1']) + col1 col2 col3 col4 + 0 A 2 0 a + 1 A 1 1 B + 2 B 9 9 c + 5 C 4 3 F + 4 D 7 2 e + 3 NaN 8 4 D + + Sort by multiple columns + + >>> df.sort_values(by=['col1', 'col2']) + col1 col2 col3 col4 + 1 A 1 1 B + 0 A 2 0 a + 2 B 9 9 c + 5 C 4 3 F + 4 D 7 2 e + 3 NaN 8 4 D + + Sort Descending + + >>> df.sort_values(by='col1', ascending=False) + col1 col2 col3 col4 + 4 D 7 2 e + 5 C 4 3 F + 2 B 9 9 c + 0 A 2 0 a + 1 A 1 1 B + 3 NaN 8 4 D + + Putting NAs first + + >>> df.sort_values(by='col1', ascending=False, na_position='first') + col1 col2 col3 col4 + 3 NaN 8 4 D + 4 D 7 2 e + 5 C 4 3 F + 2 B 9 9 c + 0 A 2 0 a + 1 A 1 1 B + + Sorting with a key function + + >>> df.sort_values(by='col4', key=lambda col: col.str.lower()) + col1 col2 col3 col4 + 0 A 2 0 a + 1 A 1 1 B + 2 B 9 9 c + 3 NaN 8 4 D + 4 D 7 2 e + 5 C 4 3 F + + Natural sort with the key argument, + using the `natsort ` package. + + >>> df = pd.DataFrame({ + ... "time": ['0hr', '128hr', '72hr', '48hr', '96hr'], + ... "value": [10, 20, 30, 40, 50] + ... }) + >>> df + time value + 0 0hr 10 + 1 128hr 20 + 2 72hr 30 + 3 48hr 40 + 4 96hr 50 + >>> from natsort import index_natsorted + >>> df.sort_values( + ... by="time", + ... key=lambda x: np.argsort(index_natsorted(df["time"])) + ... ) + time value + 0 0hr 10 + 3 48hr 40 + 2 72hr 30 + 4 96hr 50 + 1 128hr 20 + """ + raise AbstractMethodError(self) + + @overload + def sort_index( + self, + *, + axis: Axis = ..., + level: IndexLabel = ..., + ascending: bool_t | Sequence[bool_t] = ..., + inplace: Literal[True], + kind: SortKind = ..., + na_position: NaPosition = ..., + sort_remaining: bool_t = ..., + ignore_index: bool_t = ..., + key: IndexKeyFunc = ..., + ) -> None: + ... + + @overload + def sort_index( + self, + *, + axis: Axis = ..., + level: IndexLabel = ..., + ascending: bool_t | Sequence[bool_t] = ..., + inplace: Literal[False] = ..., + kind: SortKind = ..., + na_position: NaPosition = ..., + sort_remaining: bool_t = ..., + ignore_index: bool_t = ..., + key: IndexKeyFunc = ..., + ) -> Self: + ... + + @overload + def sort_index( + self, + *, + axis: Axis = ..., + level: IndexLabel = ..., + ascending: bool_t | Sequence[bool_t] = ..., + inplace: bool_t = ..., + kind: SortKind = ..., + na_position: NaPosition = ..., + sort_remaining: bool_t = ..., + ignore_index: bool_t = ..., + key: IndexKeyFunc = ..., + ) -> Self | None: + ... + + def sort_index( + self, + *, + axis: Axis = 0, + level: IndexLabel | None = None, + ascending: bool_t | Sequence[bool_t] = True, + inplace: bool_t = False, + kind: SortKind = "quicksort", + na_position: NaPosition = "last", + sort_remaining: bool_t = True, + ignore_index: bool_t = False, + key: IndexKeyFunc | None = None, + ) -> Self | None: + inplace = validate_bool_kwarg(inplace, "inplace") + axis = self._get_axis_number(axis) + ascending = validate_ascending(ascending) + + target = self._get_axis(axis) + + indexer = get_indexer_indexer( + target, level, ascending, kind, na_position, sort_remaining, key + ) + + if indexer is None: + if inplace: + result = self + else: + result = self.copy(deep=None) + + if ignore_index: + result.index = default_index(len(self)) + if inplace: + return None + else: + return result + + baxis = self._get_block_manager_axis(axis) + new_data = self._mgr.take(indexer, axis=baxis, verify=False) + + # reconstruct axis if needed + if not ignore_index: + new_axis = new_data.axes[baxis]._sort_levels_monotonic() + else: + new_axis = default_index(len(indexer)) + new_data.set_axis(baxis, new_axis) + + result = self._constructor_from_mgr(new_data, axes=new_data.axes) + + if inplace: + return self._update_inplace(result) + else: + return result.__finalize__(self, method="sort_index") + + @doc( + klass=_shared_doc_kwargs["klass"], + optional_reindex="", + ) + def reindex( + self, + labels=None, + *, + index=None, + columns=None, + axis: Axis | None = None, + method: ReindexMethod | None = None, + copy: bool_t | None = None, + level: Level | None = None, + fill_value: Scalar | None = np.nan, + limit: int | None = None, + tolerance=None, + ) -> Self: + """ + Conform {klass} to new index with optional filling logic. + + Places NA/NaN in locations having no value in the previous index. A new object + is produced unless the new index is equivalent to the current one and + ``copy=False``. + + Parameters + ---------- + {optional_reindex} + method : {{None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}} + Method to use for filling holes in reindexed DataFrame. + Please note: this is only applicable to DataFrames/Series with a + monotonically increasing/decreasing index. + + * None (default): don't fill gaps + * pad / ffill: Propagate last valid observation forward to next + valid. + * backfill / bfill: Use next valid observation to fill gap. + * nearest: Use nearest valid observations to fill gap. + + copy : bool, default True + Return a new object, even if the passed indexes are the same. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + fill_value : scalar, default np.nan + Value to use for missing values. Defaults to NaN, but can be any + "compatible" value. + limit : int, default None + Maximum number of consecutive elements to forward or backward fill. + tolerance : optional + Maximum distance between original and new labels for inexact + matches. The values of the index at the matching locations most + satisfy the equation ``abs(index[indexer] - target) <= tolerance``. + + Tolerance may be a scalar value, which applies the same tolerance + to all values, or list-like, which applies variable tolerance per + element. List-like includes list, tuple, array, Series, and must be + the same size as the index and its dtype must exactly match the + index's type. + + Returns + ------- + {klass} with changed index. + + See Also + -------- + DataFrame.set_index : Set row labels. + DataFrame.reset_index : Remove row labels or move them to new columns. + DataFrame.reindex_like : Change to same indices as other DataFrame. + + Examples + -------- + ``DataFrame.reindex`` supports two calling conventions + + * ``(index=index_labels, columns=column_labels, ...)`` + * ``(labels, axis={{'index', 'columns'}}, ...)`` + + We *highly* recommend using keyword arguments to clarify your + intent. + + Create a dataframe with some fictional data. + + >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror'] + >>> df = pd.DataFrame({{'http_status': [200, 200, 404, 404, 301], + ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}}, + ... index=index) + >>> df + http_status response_time + Firefox 200 0.04 + Chrome 200 0.02 + Safari 404 0.07 + IE10 404 0.08 + Konqueror 301 1.00 + + Create a new index and reindex the dataframe. By default + values in the new index that do not have corresponding + records in the dataframe are assigned ``NaN``. + + >>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10', + ... 'Chrome'] + >>> df.reindex(new_index) + http_status response_time + Safari 404.0 0.07 + Iceweasel NaN NaN + Comodo Dragon NaN NaN + IE10 404.0 0.08 + Chrome 200.0 0.02 + + We can fill in the missing values by passing a value to + the keyword ``fill_value``. Because the index is not monotonically + increasing or decreasing, we cannot use arguments to the keyword + ``method`` to fill the ``NaN`` values. + + >>> df.reindex(new_index, fill_value=0) + http_status response_time + Safari 404 0.07 + Iceweasel 0 0.00 + Comodo Dragon 0 0.00 + IE10 404 0.08 + Chrome 200 0.02 + + >>> df.reindex(new_index, fill_value='missing') + http_status response_time + Safari 404 0.07 + Iceweasel missing missing + Comodo Dragon missing missing + IE10 404 0.08 + Chrome 200 0.02 + + We can also reindex the columns. + + >>> df.reindex(columns=['http_status', 'user_agent']) + http_status user_agent + Firefox 200 NaN + Chrome 200 NaN + Safari 404 NaN + IE10 404 NaN + Konqueror 301 NaN + + Or we can use "axis-style" keyword arguments + + >>> df.reindex(['http_status', 'user_agent'], axis="columns") + http_status user_agent + Firefox 200 NaN + Chrome 200 NaN + Safari 404 NaN + IE10 404 NaN + Konqueror 301 NaN + + To further illustrate the filling functionality in + ``reindex``, we will create a dataframe with a + monotonically increasing index (for example, a sequence + of dates). + + >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D') + >>> df2 = pd.DataFrame({{"prices": [100, 101, np.nan, 100, 89, 88]}}, + ... index=date_index) + >>> df2 + prices + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + + Suppose we decide to expand the dataframe to cover a wider + date range. + + >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D') + >>> df2.reindex(date_index2) + prices + 2009-12-29 NaN + 2009-12-30 NaN + 2009-12-31 NaN + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + 2010-01-07 NaN + + The index entries that did not have a value in the original data frame + (for example, '2009-12-29') are by default filled with ``NaN``. + If desired, we can fill in the missing values using one of several + options. + + For example, to back-propagate the last valid value to fill the ``NaN`` + values, pass ``bfill`` as an argument to the ``method`` keyword. + + >>> df2.reindex(date_index2, method='bfill') + prices + 2009-12-29 100.0 + 2009-12-30 100.0 + 2009-12-31 100.0 + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + 2010-01-07 NaN + + Please note that the ``NaN`` value present in the original dataframe + (at index value 2010-01-03) will not be filled by any of the + value propagation schemes. This is because filling while reindexing + does not look at dataframe values, but only compares the original and + desired indexes. If you do want to fill in the ``NaN`` values present + in the original dataframe, use the ``fillna()`` method. + + See the :ref:`user guide ` for more. + """ + # TODO: Decide if we care about having different examples for different + # kinds + + if index is not None and columns is not None and labels is not None: + raise TypeError("Cannot specify all of 'labels', 'index', 'columns'.") + elif index is not None or columns is not None: + if axis is not None: + raise TypeError( + "Cannot specify both 'axis' and any of 'index' or 'columns'" + ) + if labels is not None: + if index is not None: + columns = labels + else: + index = labels + else: + if axis and self._get_axis_number(axis) == 1: + columns = labels + else: + index = labels + axes: dict[Literal["index", "columns"], Any] = { + "index": index, + "columns": columns, + } + method = clean_reindex_fill_method(method) + + # if all axes that are requested to reindex are equal, then only copy + # if indicated must have index names equal here as well as values + if copy and using_copy_on_write(): + copy = False + if all( + self._get_axis(axis_name).identical(ax) + for axis_name, ax in axes.items() + if ax is not None + ): + return self.copy(deep=copy) + + # check if we are a multi reindex + if self._needs_reindex_multi(axes, method, level): + return self._reindex_multi(axes, copy, fill_value) + + # perform the reindex on the axes + return self._reindex_axes( + axes, level, limit, tolerance, method, fill_value, copy + ).__finalize__(self, method="reindex") + + @final + def _reindex_axes( + self, + axes, + level: Level | None, + limit: int | None, + tolerance, + method, + fill_value: Scalar | None, + copy: bool_t | None, + ) -> Self: + """Perform the reindex for all the axes.""" + obj = self + for a in self._AXIS_ORDERS: + labels = axes[a] + if labels is None: + continue + + ax = self._get_axis(a) + new_index, indexer = ax.reindex( + labels, level=level, limit=limit, tolerance=tolerance, method=method + ) + + axis = self._get_axis_number(a) + obj = obj._reindex_with_indexers( + {axis: [new_index, indexer]}, + fill_value=fill_value, + copy=copy, + allow_dups=False, + ) + # If we've made a copy once, no need to make another one + copy = False + + return obj + + def _needs_reindex_multi(self, axes, method, level: Level | None) -> bool_t: + """Check if we do need a multi reindex.""" + return ( + (common.count_not_none(*axes.values()) == self._AXIS_LEN) + and method is None + and level is None + # reindex_multi calls self.values, so we only want to go + # down that path when doing so is cheap. + and self._can_fast_transpose + ) + + def _reindex_multi(self, axes, copy, fill_value): + raise AbstractMethodError(self) + + @final + def _reindex_with_indexers( + self, + reindexers, + fill_value=None, + copy: bool_t | None = False, + allow_dups: bool_t = False, + ) -> Self: + """allow_dups indicates an internal call here""" + # reindex doing multiple operations on different axes if indicated + new_data = self._mgr + for axis in sorted(reindexers.keys()): + index, indexer = reindexers[axis] + baxis = self._get_block_manager_axis(axis) + + if index is None: + continue + + index = ensure_index(index) + if indexer is not None: + indexer = ensure_platform_int(indexer) + + # TODO: speed up on homogeneous DataFrame objects (see _reindex_multi) + new_data = new_data.reindex_indexer( + index, + indexer, + axis=baxis, + fill_value=fill_value, + allow_dups=allow_dups, + copy=copy, + ) + # If we've made a copy once, no need to make another one + copy = False + + if ( + (copy or copy is None) + and new_data is self._mgr + and not using_copy_on_write() + ): + new_data = new_data.copy(deep=copy) + elif using_copy_on_write() and new_data is self._mgr: + new_data = new_data.copy(deep=False) + + return self._constructor_from_mgr(new_data, axes=new_data.axes).__finalize__( + self + ) + + def filter( + self, + items=None, + like: str | None = None, + regex: str | None = None, + axis: Axis | None = None, + ) -> Self: + """ + Subset the dataframe rows or columns according to the specified index labels. + + Note that this routine does not filter a dataframe on its + contents. The filter is applied to the labels of the index. + + Parameters + ---------- + items : list-like + Keep labels from axis which are in items. + like : str + Keep labels from axis for which "like in label == True". + regex : str (regular expression) + Keep labels from axis for which re.search(regex, label) == True. + axis : {0 or 'index', 1 or 'columns', None}, default None + The axis to filter on, expressed either as an index (int) + or axis name (str). By default this is the info axis, 'columns' for + DataFrame. For `Series` this parameter is unused and defaults to `None`. + + Returns + ------- + same type as input object + + See Also + -------- + DataFrame.loc : Access a group of rows and columns + by label(s) or a boolean array. + + Notes + ----- + The ``items``, ``like``, and ``regex`` parameters are + enforced to be mutually exclusive. + + ``axis`` defaults to the info axis that is used when indexing + with ``[]``. + + Examples + -------- + >>> df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])), + ... index=['mouse', 'rabbit'], + ... columns=['one', 'two', 'three']) + >>> df + one two three + mouse 1 2 3 + rabbit 4 5 6 + + >>> # select columns by name + >>> df.filter(items=['one', 'three']) + one three + mouse 1 3 + rabbit 4 6 + + >>> # select columns by regular expression + >>> df.filter(regex='e$', axis=1) + one three + mouse 1 3 + rabbit 4 6 + + >>> # select rows containing 'bbi' + >>> df.filter(like='bbi', axis=0) + one two three + rabbit 4 5 6 + """ + nkw = common.count_not_none(items, like, regex) + if nkw > 1: + raise TypeError( + "Keyword arguments `items`, `like`, or `regex` " + "are mutually exclusive" + ) + + if axis is None: + axis = self._info_axis_name + labels = self._get_axis(axis) + + if items is not None: + name = self._get_axis_name(axis) + items = Index(items).intersection(labels) + if len(items) == 0: + # Keep the dtype of labels when we are empty + items = items.astype(labels.dtype) + # error: Keywords must be strings + return self.reindex(**{name: items}) # type: ignore[misc] + elif like: + + def f(x) -> bool_t: + assert like is not None # needed for mypy + return like in ensure_str(x) + + values = labels.map(f) + return self.loc(axis=axis)[values] + elif regex: + + def f(x) -> bool_t: + return matcher.search(ensure_str(x)) is not None + + matcher = re.compile(regex) + values = labels.map(f) + return self.loc(axis=axis)[values] + else: + raise TypeError("Must pass either `items`, `like`, or `regex`") + + @final + def head(self, n: int = 5) -> Self: + """ + Return the first `n` rows. + + This function returns the first `n` rows for the object based + on position. It is useful for quickly testing if your object + has the right type of data in it. + + For negative values of `n`, this function returns all rows except + the last `|n|` rows, equivalent to ``df[:n]``. + + If n is larger than the number of rows, this function returns all rows. + + Parameters + ---------- + n : int, default 5 + Number of rows to select. + + Returns + ------- + same type as caller + The first `n` rows of the caller object. + + See Also + -------- + DataFrame.tail: Returns the last `n` rows. + + Examples + -------- + >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', + ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']}) + >>> df + animal + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra + + Viewing the first 5 lines + + >>> df.head() + animal + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + + Viewing the first `n` lines (three in this case) + + >>> df.head(3) + animal + 0 alligator + 1 bee + 2 falcon + + For negative values of `n` + + >>> df.head(-3) + animal + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot + """ + if using_copy_on_write(): + return self.iloc[:n].copy() + return self.iloc[:n] + + @final + def tail(self, n: int = 5) -> Self: + """ + Return the last `n` rows. + + This function returns last `n` rows from the object based on + position. It is useful for quickly verifying data, for example, + after sorting or appending rows. + + For negative values of `n`, this function returns all rows except + the first `|n|` rows, equivalent to ``df[|n|:]``. + + If n is larger than the number of rows, this function returns all rows. + + Parameters + ---------- + n : int, default 5 + Number of rows to select. + + Returns + ------- + type of caller + The last `n` rows of the caller object. + + See Also + -------- + DataFrame.head : The first `n` rows of the caller object. + + Examples + -------- + >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', + ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']}) + >>> df + animal + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra + + Viewing the last 5 lines + + >>> df.tail() + animal + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra + + Viewing the last `n` lines (three in this case) + + >>> df.tail(3) + animal + 6 shark + 7 whale + 8 zebra + + For negative values of `n` + + >>> df.tail(-3) + animal + 3 lion + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra + """ + if using_copy_on_write(): + if n == 0: + return self.iloc[0:0].copy() + return self.iloc[-n:].copy() + if n == 0: + return self.iloc[0:0] + return self.iloc[-n:] + + @final + def sample( + self, + n: int | None = None, + frac: float | None = None, + replace: bool_t = False, + weights=None, + random_state: RandomState | None = None, + axis: Axis | None = None, + ignore_index: bool_t = False, + ) -> Self: + """ + Return a random sample of items from an axis of object. + + You can use `random_state` for reproducibility. + + Parameters + ---------- + n : int, optional + Number of items from axis to return. Cannot be used with `frac`. + Default = 1 if `frac` = None. + frac : float, optional + Fraction of axis items to return. Cannot be used with `n`. + replace : bool, default False + Allow or disallow sampling of the same row more than once. + weights : str or ndarray-like, optional + Default 'None' results in equal probability weighting. + If passed a Series, will align with target object on index. Index + values in weights not found in sampled object will be ignored and + index values in sampled object not in weights will be assigned + weights of zero. + If called on a DataFrame, will accept the name of a column + when axis = 0. + Unless weights are a Series, weights must be same length as axis + being sampled. + If weights do not sum to 1, they will be normalized to sum to 1. + Missing values in the weights column will be treated as zero. + Infinite values not allowed. + random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional + If int, array-like, or BitGenerator, seed for random number generator. + If np.random.RandomState or np.random.Generator, use as given. + + .. versionchanged:: 1.4.0 + + np.random.Generator objects now accepted + + axis : {0 or 'index', 1 or 'columns', None}, default None + Axis to sample. Accepts axis number or name. Default is stat axis + for given data type. For `Series` this parameter is unused and defaults to `None`. + ignore_index : bool, default False + If True, the resulting index will be labeled 0, 1, …, n - 1. + + .. versionadded:: 1.3.0 + + Returns + ------- + Series or DataFrame + A new object of same type as caller containing `n` items randomly + sampled from the caller object. + + See Also + -------- + DataFrameGroupBy.sample: Generates random samples from each group of a + DataFrame object. + SeriesGroupBy.sample: Generates random samples from each group of a + Series object. + numpy.random.choice: Generates a random sample from a given 1-D numpy + array. + + Notes + ----- + If `frac` > 1, `replacement` should be set to `True`. + + Examples + -------- + >>> df = pd.DataFrame({'num_legs': [2, 4, 8, 0], + ... 'num_wings': [2, 0, 0, 0], + ... 'num_specimen_seen': [10, 2, 1, 8]}, + ... index=['falcon', 'dog', 'spider', 'fish']) + >>> df + num_legs num_wings num_specimen_seen + falcon 2 2 10 + dog 4 0 2 + spider 8 0 1 + fish 0 0 8 + + Extract 3 random elements from the ``Series`` ``df['num_legs']``: + Note that we use `random_state` to ensure the reproducibility of + the examples. + + >>> df['num_legs'].sample(n=3, random_state=1) + fish 0 + spider 8 + falcon 2 + Name: num_legs, dtype: int64 + + A random 50% sample of the ``DataFrame`` with replacement: + + >>> df.sample(frac=0.5, replace=True, random_state=1) + num_legs num_wings num_specimen_seen + dog 4 0 2 + fish 0 0 8 + + An upsample sample of the ``DataFrame`` with replacement: + Note that `replace` parameter has to be `True` for `frac` parameter > 1. + + >>> df.sample(frac=2, replace=True, random_state=1) + num_legs num_wings num_specimen_seen + dog 4 0 2 + fish 0 0 8 + falcon 2 2 10 + falcon 2 2 10 + fish 0 0 8 + dog 4 0 2 + fish 0 0 8 + dog 4 0 2 + + Using a DataFrame column as weights. Rows with larger value in the + `num_specimen_seen` column are more likely to be sampled. + + >>> df.sample(n=2, weights='num_specimen_seen', random_state=1) + num_legs num_wings num_specimen_seen + falcon 2 2 10 + fish 0 0 8 + """ # noqa: E501 + if axis is None: + axis = 0 + + axis = self._get_axis_number(axis) + obj_len = self.shape[axis] + + # Process random_state argument + rs = common.random_state(random_state) + + size = sample.process_sampling_size(n, frac, replace) + if size is None: + assert frac is not None + size = round(frac * obj_len) + + if weights is not None: + weights = sample.preprocess_weights(self, weights, axis) + + sampled_indices = sample.sample(obj_len, size, replace, weights, rs) + result = self.take(sampled_indices, axis=axis) + + if ignore_index: + result.index = default_index(len(result)) + + return result + + @final + @doc(klass=_shared_doc_kwargs["klass"]) + def pipe( + self, + func: Callable[..., T] | tuple[Callable[..., T], str], + *args, + **kwargs, + ) -> T: + r""" + Apply chainable functions that expect Series or DataFrames. + + Parameters + ---------- + func : function + Function to apply to the {klass}. + ``args``, and ``kwargs`` are passed into ``func``. + Alternatively a ``(callable, data_keyword)`` tuple where + ``data_keyword`` is a string indicating the keyword of + ``callable`` that expects the {klass}. + *args : iterable, optional + Positional arguments passed into ``func``. + **kwargs : mapping, optional + A dictionary of keyword arguments passed into ``func``. + + Returns + ------- + the return type of ``func``. + + See Also + -------- + DataFrame.apply : Apply a function along input axis of DataFrame. + DataFrame.map : Apply a function elementwise on a whole DataFrame. + Series.map : Apply a mapping correspondence on a + :class:`~pandas.Series`. + + Notes + ----- + Use ``.pipe`` when chaining together functions that expect + Series, DataFrames or GroupBy objects. + + Examples + -------- + Constructing a income DataFrame from a dictionary. + + >>> data = [[8000, 1000], [9500, np.nan], [5000, 2000]] + >>> df = pd.DataFrame(data, columns=['Salary', 'Others']) + >>> df + Salary Others + 0 8000 1000.0 + 1 9500 NaN + 2 5000 2000.0 + + Functions that perform tax reductions on an income DataFrame. + + >>> def subtract_federal_tax(df): + ... return df * 0.9 + >>> def subtract_state_tax(df, rate): + ... return df * (1 - rate) + >>> def subtract_national_insurance(df, rate, rate_increase): + ... new_rate = rate + rate_increase + ... return df * (1 - new_rate) + + Instead of writing + + >>> subtract_national_insurance( + ... subtract_state_tax(subtract_federal_tax(df), rate=0.12), + ... rate=0.05, + ... rate_increase=0.02) # doctest: +SKIP + + You can write + + >>> ( + ... df.pipe(subtract_federal_tax) + ... .pipe(subtract_state_tax, rate=0.12) + ... .pipe(subtract_national_insurance, rate=0.05, rate_increase=0.02) + ... ) + Salary Others + 0 5892.48 736.56 + 1 6997.32 NaN + 2 3682.80 1473.12 + + If you have a function that takes the data as (say) the second + argument, pass a tuple indicating which keyword expects the + data. For example, suppose ``national_insurance`` takes its data as ``df`` + in the second argument: + + >>> def subtract_national_insurance(rate, df, rate_increase): + ... new_rate = rate + rate_increase + ... return df * (1 - new_rate) + >>> ( + ... df.pipe(subtract_federal_tax) + ... .pipe(subtract_state_tax, rate=0.12) + ... .pipe( + ... (subtract_national_insurance, 'df'), + ... rate=0.05, + ... rate_increase=0.02 + ... ) + ... ) + Salary Others + 0 5892.48 736.56 + 1 6997.32 NaN + 2 3682.80 1473.12 + """ + if using_copy_on_write(): + return common.pipe(self.copy(deep=None), func, *args, **kwargs) + return common.pipe(self, func, *args, **kwargs) + + # ---------------------------------------------------------------------- + # Attribute access + + @final + def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: + """ + Propagate metadata from other to self. + + Parameters + ---------- + other : the object from which to get the attributes that we are going + to propagate + method : str, optional + A passed method name providing context on where ``__finalize__`` + was called. + + .. warning:: + + The value passed as `method` are not currently considered + stable across pandas releases. + """ + if isinstance(other, NDFrame): + if other.attrs: + # We want attrs propagation to have minimal performance + # impact if attrs are not used; i.e. attrs is an empty dict. + # One could make the deepcopy unconditionally, but a deepcopy + # of an empty dict is 50x more expensive than the empty check. + self.attrs = deepcopy(other.attrs) + + self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels + # For subclasses using _metadata. + for name in set(self._metadata) & set(other._metadata): + assert isinstance(name, str) + object.__setattr__(self, name, getattr(other, name, None)) + + if method == "concat": + # propagate attrs only if all concat arguments have the same attrs + if all(bool(obj.attrs) for obj in other.objs): + # all concatenate arguments have non-empty attrs + attrs = other.objs[0].attrs + have_same_attrs = all(obj.attrs == attrs for obj in other.objs[1:]) + if have_same_attrs: + self.attrs = deepcopy(attrs) + + allows_duplicate_labels = all( + x.flags.allows_duplicate_labels for x in other.objs + ) + self.flags.allows_duplicate_labels = allows_duplicate_labels + + return self + + @final + def __getattr__(self, name: str): + """ + After regular attribute access, try looking up the name + This allows simpler access to columns for interactive use. + """ + # Note: obj.x will always call obj.__getattribute__('x') prior to + # calling obj.__getattr__('x'). + if ( + name not in self._internal_names_set + and name not in self._metadata + and name not in self._accessors + and self._info_axis._can_hold_identifiers_and_holds_name(name) + ): + return self[name] + return object.__getattribute__(self, name) + + @final + def __setattr__(self, name: str, value) -> None: + """ + After regular attribute access, try setting the name + This allows simpler access to columns for interactive use. + """ + # first try regular attribute access via __getattribute__, so that + # e.g. ``obj.x`` and ``obj.x = 4`` will always reference/modify + # the same attribute. + + try: + object.__getattribute__(self, name) + return object.__setattr__(self, name, value) + except AttributeError: + pass + + # if this fails, go on to more involved attribute setting + # (note that this matches __getattr__, above). + if name in self._internal_names_set: + object.__setattr__(self, name, value) + elif name in self._metadata: + object.__setattr__(self, name, value) + else: + try: + existing = getattr(self, name) + if isinstance(existing, Index): + object.__setattr__(self, name, value) + elif name in self._info_axis: + self[name] = value + else: + object.__setattr__(self, name, value) + except (AttributeError, TypeError): + if isinstance(self, ABCDataFrame) and (is_list_like(value)): + warnings.warn( + "Pandas doesn't allow columns to be " + "created via a new attribute name - see " + "https://pandas.pydata.org/pandas-docs/" + "stable/indexing.html#attribute-access", + stacklevel=find_stack_level(), + ) + object.__setattr__(self, name, value) + + @final + def _dir_additions(self) -> set[str]: + """ + add the string-like attributes from the info_axis. + If info_axis is a MultiIndex, its first level values are used. + """ + additions = super()._dir_additions() + if self._info_axis._can_hold_strings: + additions.update(self._info_axis._dir_additions_for_owner) + return additions + + # ---------------------------------------------------------------------- + # Consolidation of internals + + @final + def _protect_consolidate(self, f): + """ + Consolidate _mgr -- if the blocks have changed, then clear the + cache + """ + if isinstance(self._mgr, (ArrayManager, SingleArrayManager)): + return f() + blocks_before = len(self._mgr.blocks) + result = f() + if len(self._mgr.blocks) != blocks_before: + self._clear_item_cache() + return result + + @final + def _consolidate_inplace(self) -> None: + """Consolidate data in place and return None""" + + def f() -> None: + self._mgr = self._mgr.consolidate() + + self._protect_consolidate(f) + + @final + def _consolidate(self): + """ + Compute NDFrame with "consolidated" internals (data of each dtype + grouped together in a single ndarray). + + Returns + ------- + consolidated : same type as caller + """ + f = lambda: self._mgr.consolidate() + cons_data = self._protect_consolidate(f) + return self._constructor_from_mgr(cons_data, axes=cons_data.axes).__finalize__( + self + ) + + @final + @property + def _is_mixed_type(self) -> bool_t: + if self._mgr.is_single_block: + # Includes all Series cases + return False + + if self._mgr.any_extension_types: + # Even if they have the same dtype, we can't consolidate them, + # so we pretend this is "mixed'" + return True + + return self.dtypes.nunique() > 1 + + @final + def _get_numeric_data(self) -> Self: + new_mgr = self._mgr.get_numeric_data() + return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self) + + @final + def _get_bool_data(self): + new_mgr = self._mgr.get_bool_data() + return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self) + + # ---------------------------------------------------------------------- + # Internal Interface Methods + + @property + def values(self): + raise AbstractMethodError(self) + + @property + def _values(self) -> ArrayLike: + """internal implementation""" + raise AbstractMethodError(self) + + @property + def dtypes(self): + """ + Return the dtypes in the DataFrame. + + This returns a Series with the data type of each column. + The result's index is the original DataFrame's columns. Columns + with mixed types are stored with the ``object`` dtype. See + :ref:`the User Guide ` for more. + + Returns + ------- + pandas.Series + The data type of each column. + + Examples + -------- + >>> df = pd.DataFrame({'float': [1.0], + ... 'int': [1], + ... 'datetime': [pd.Timestamp('20180310')], + ... 'string': ['foo']}) + >>> df.dtypes + float float64 + int int64 + datetime datetime64[ns] + string object + dtype: object + """ + data = self._mgr.get_dtypes() + return self._constructor_sliced(data, index=self._info_axis, dtype=np.object_) + + @final + def astype( + self, dtype, copy: bool_t | None = None, errors: IgnoreRaise = "raise" + ) -> Self: + """ + Cast a pandas object to a specified dtype ``dtype``. + + Parameters + ---------- + dtype : str, data type, Series or Mapping of column name -> data type + Use a str, numpy.dtype, pandas.ExtensionDtype or Python type to + cast entire pandas object to the same type. Alternatively, use a + mapping, e.g. {col: dtype, ...}, where col is a column label and dtype is + a numpy.dtype or Python type to cast one or more of the DataFrame's + columns to column-specific types. + copy : bool, default True + Return a copy when ``copy=True`` (be very careful setting + ``copy=False`` as changes to values then may propagate to other + pandas objects). + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + errors : {'raise', 'ignore'}, default 'raise' + Control raising of exceptions on invalid data for provided dtype. + + - ``raise`` : allow exceptions to be raised + - ``ignore`` : suppress exceptions. On error return original object. + + Returns + ------- + same type as caller + + See Also + -------- + to_datetime : Convert argument to datetime. + to_timedelta : Convert argument to timedelta. + to_numeric : Convert argument to a numeric type. + numpy.ndarray.astype : Cast a numpy array to a specified type. + + Notes + ----- + .. versionchanged:: 2.0.0 + + Using ``astype`` to convert from timezone-naive dtype to + timezone-aware dtype will raise an exception. + Use :meth:`Series.dt.tz_localize` instead. + + Examples + -------- + Create a DataFrame: + + >>> d = {'col1': [1, 2], 'col2': [3, 4]} + >>> df = pd.DataFrame(data=d) + >>> df.dtypes + col1 int64 + col2 int64 + dtype: object + + Cast all columns to int32: + + >>> df.astype('int32').dtypes + col1 int32 + col2 int32 + dtype: object + + Cast col1 to int32 using a dictionary: + + >>> df.astype({'col1': 'int32'}).dtypes + col1 int32 + col2 int64 + dtype: object + + Create a series: + + >>> ser = pd.Series([1, 2], dtype='int32') + >>> ser + 0 1 + 1 2 + dtype: int32 + >>> ser.astype('int64') + 0 1 + 1 2 + dtype: int64 + + Convert to categorical type: + + >>> ser.astype('category') + 0 1 + 1 2 + dtype: category + Categories (2, int32): [1, 2] + + Convert to ordered categorical type with custom ordering: + + >>> from pandas.api.types import CategoricalDtype + >>> cat_dtype = CategoricalDtype( + ... categories=[2, 1], ordered=True) + >>> ser.astype(cat_dtype) + 0 1 + 1 2 + dtype: category + Categories (2, int64): [2 < 1] + + Create a series of dates: + + >>> ser_date = pd.Series(pd.date_range('20200101', periods=3)) + >>> ser_date + 0 2020-01-01 + 1 2020-01-02 + 2 2020-01-03 + dtype: datetime64[ns] + """ + if copy and using_copy_on_write(): + copy = False + + if is_dict_like(dtype): + if self.ndim == 1: # i.e. Series + if len(dtype) > 1 or self.name not in dtype: + raise KeyError( + "Only the Series name can be used for " + "the key in Series dtype mappings." + ) + new_type = dtype[self.name] + return self.astype(new_type, copy, errors) + + # GH#44417 cast to Series so we can use .iat below, which will be + # robust in case we + from pandas import Series + + dtype_ser = Series(dtype, dtype=object) + + for col_name in dtype_ser.index: + if col_name not in self: + raise KeyError( + "Only a column name can be used for the " + "key in a dtype mappings argument. " + f"'{col_name}' not found in columns." + ) + + dtype_ser = dtype_ser.reindex(self.columns, fill_value=None, copy=False) + + results = [] + for i, (col_name, col) in enumerate(self.items()): + cdt = dtype_ser.iat[i] + if isna(cdt): + res_col = col.copy(deep=copy) + else: + try: + res_col = col.astype(dtype=cdt, copy=copy, errors=errors) + except ValueError as ex: + ex.args = ( + f"{ex}: Error while type casting for column '{col_name}'", + ) + raise + results.append(res_col) + + elif is_extension_array_dtype(dtype) and self.ndim > 1: + # TODO(EA2D): special case not needed with 2D EAs + dtype = pandas_dtype(dtype) + if isinstance(dtype, ExtensionDtype) and all( + arr.dtype == dtype for arr in self._mgr.arrays + ): + return self.copy(deep=copy) + # GH 18099/22869: columnwise conversion to extension dtype + # GH 24704: self.items handles duplicate column names + results = [ + ser.astype(dtype, copy=copy, errors=errors) for _, ser in self.items() + ] + + else: + # else, only a single dtype is given + new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) + res = self._constructor_from_mgr(new_data, axes=new_data.axes) + return res.__finalize__(self, method="astype") + + # GH 33113: handle empty frame or series + if not results: + return self.copy(deep=None) + + # GH 19920: retain column metadata after concat + result = concat(results, axis=1, copy=False) + # GH#40810 retain subclass + # error: Incompatible types in assignment + # (expression has type "Self", variable has type "DataFrame") + result = self._constructor(result) # type: ignore[assignment] + result.columns = self.columns + result = result.__finalize__(self, method="astype") + # https://github.com/python/mypy/issues/8354 + return cast(Self, result) + + @final + def copy(self, deep: bool_t | None = True) -> Self: + """ + Make a copy of this object's indices and data. + + When ``deep=True`` (default), a new object will be created with a + copy of the calling object's data and indices. Modifications to + the data or indices of the copy will not be reflected in the + original object (see notes below). + + When ``deep=False``, a new object will be created without copying + the calling object's data or index (only references to the data + and index are copied). Any changes to the data of the original + will be reflected in the shallow copy (and vice versa). + + .. note:: + The ``deep=False`` behaviour as described above will change + in pandas 3.0. `Copy-on-Write + `__ + will be enabled by default, which means that the "shallow" copy + is that is returned with ``deep=False`` will still avoid making + an eager copy, but changes to the data of the original will *no* + longer be reflected in the shallow copy (or vice versa). Instead, + it makes use of a lazy (deferred) copy mechanism that will copy + the data only when any changes to the original or shallow copy is + made. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + + Parameters + ---------- + deep : bool, default True + Make a deep copy, including a copy of the data and the indices. + With ``deep=False`` neither the indices nor the data are copied. + + Returns + ------- + Series or DataFrame + Object type matches caller. + + Notes + ----- + When ``deep=True``, data is copied but actual Python objects + will not be copied recursively, only the reference to the object. + This is in contrast to `copy.deepcopy` in the Standard Library, + which recursively copies object data (see examples below). + + While ``Index`` objects are copied when ``deep=True``, the underlying + numpy array is not copied for performance reasons. Since ``Index`` is + immutable, the underlying data can be safely shared and a copy + is not needed. + + Since pandas is not thread safe, see the + :ref:`gotchas ` when copying in a threading + environment. + + When ``copy_on_write`` in pandas config is set to ``True``, the + ``copy_on_write`` config takes effect even when ``deep=False``. + This means that any changes to the copied data would make a new copy + of the data upon write (and vice versa). Changes made to either the + original or copied variable would not be reflected in the counterpart. + See :ref:`Copy_on_Write ` for more information. + + Examples + -------- + >>> s = pd.Series([1, 2], index=["a", "b"]) + >>> s + a 1 + b 2 + dtype: int64 + + >>> s_copy = s.copy() + >>> s_copy + a 1 + b 2 + dtype: int64 + + **Shallow copy versus default (deep) copy:** + + >>> s = pd.Series([1, 2], index=["a", "b"]) + >>> deep = s.copy() + >>> shallow = s.copy(deep=False) + + Shallow copy shares data and index with original. + + >>> s is shallow + False + >>> s.values is shallow.values and s.index is shallow.index + True + + Deep copy has own copy of data and index. + + >>> s is deep + False + >>> s.values is deep.values or s.index is deep.index + False + + Updates to the data shared by shallow copy and original is reflected + in both (NOTE: this will no longer be true for pandas >= 3.0); + deep copy remains unchanged. + + >>> s.iloc[0] = 3 + >>> shallow.iloc[1] = 4 + >>> s + a 3 + b 4 + dtype: int64 + >>> shallow + a 3 + b 4 + dtype: int64 + >>> deep + a 1 + b 2 + dtype: int64 + + Note that when copying an object containing Python objects, a deep copy + will copy the data, but will not do so recursively. Updating a nested + data object will be reflected in the deep copy. + + >>> s = pd.Series([[1, 2], [3, 4]]) + >>> deep = s.copy() + >>> s[0][0] = 10 + >>> s + 0 [10, 2] + 1 [3, 4] + dtype: object + >>> deep + 0 [10, 2] + 1 [3, 4] + dtype: object + + **Copy-on-Write is set to true**, the shallow copy is not modified + when the original data is changed: + + >>> with pd.option_context("mode.copy_on_write", True): + ... s = pd.Series([1, 2], index=["a", "b"]) + ... copy = s.copy(deep=False) + ... s.iloc[0] = 100 + ... s + a 100 + b 2 + dtype: int64 + >>> copy + a 1 + b 2 + dtype: int64 + """ + data = self._mgr.copy(deep=deep) + self._clear_item_cache() + return self._constructor_from_mgr(data, axes=data.axes).__finalize__( + self, method="copy" + ) + + @final + def __copy__(self, deep: bool_t = True) -> Self: + return self.copy(deep=deep) + + @final + def __deepcopy__(self, memo=None) -> Self: + """ + Parameters + ---------- + memo, default None + Standard signature. Unused + """ + return self.copy(deep=True) + + @final + def infer_objects(self, copy: bool_t | None = None) -> Self: + """ + Attempt to infer better dtypes for object columns. + + Attempts soft conversion of object-dtyped + columns, leaving non-object and unconvertible + columns unchanged. The inference rules are the + same as during normal Series/DataFrame construction. + + Parameters + ---------- + copy : bool, default True + Whether to make a copy for non-object or non-inferable columns + or Series. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + + Returns + ------- + same type as input object + + See Also + -------- + to_datetime : Convert argument to datetime. + to_timedelta : Convert argument to timedelta. + to_numeric : Convert argument to numeric type. + convert_dtypes : Convert argument to best possible dtype. + + Examples + -------- + >>> df = pd.DataFrame({"A": ["a", 1, 2, 3]}) + >>> df = df.iloc[1:] + >>> df + A + 1 1 + 2 2 + 3 3 + + >>> df.dtypes + A object + dtype: object + + >>> df.infer_objects().dtypes + A int64 + dtype: object + """ + new_mgr = self._mgr.convert(copy=copy) + res = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + return res.__finalize__(self, method="infer_objects") + + @final + def convert_dtypes( + self, + infer_objects: bool_t = True, + convert_string: bool_t = True, + convert_integer: bool_t = True, + convert_boolean: bool_t = True, + convert_floating: bool_t = True, + dtype_backend: DtypeBackend = "numpy_nullable", + ) -> Self: + """ + Convert columns to the best possible dtypes using dtypes supporting ``pd.NA``. + + Parameters + ---------- + infer_objects : bool, default True + Whether object dtypes should be converted to the best possible types. + convert_string : bool, default True + Whether object dtypes should be converted to ``StringDtype()``. + convert_integer : bool, default True + Whether, if possible, conversion can be done to integer extension types. + convert_boolean : bool, defaults True + Whether object dtypes should be converted to ``BooleanDtypes()``. + convert_floating : bool, defaults True + Whether, if possible, conversion can be done to floating extension types. + If `convert_integer` is also True, preference will be give to integer + dtypes if the floats can be faithfully casted to integers. + dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). Behaviour is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + (default). + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` + DataFrame. + + .. versionadded:: 2.0 + + Returns + ------- + Series or DataFrame + Copy of input object with new dtype. + + See Also + -------- + infer_objects : Infer dtypes of objects. + to_datetime : Convert argument to datetime. + to_timedelta : Convert argument to timedelta. + to_numeric : Convert argument to a numeric type. + + Notes + ----- + By default, ``convert_dtypes`` will attempt to convert a Series (or each + Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options + ``convert_string``, ``convert_integer``, ``convert_boolean`` and + ``convert_floating``, it is possible to turn off individual conversions + to ``StringDtype``, the integer extension types, ``BooleanDtype`` + or floating extension types, respectively. + + For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference + rules as during normal Series/DataFrame construction. Then, if possible, + convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer + or floating extension type, otherwise leave as ``object``. + + If the dtype is integer, convert to an appropriate integer extension type. + + If the dtype is numeric, and consists of all integers, convert to an + appropriate integer extension type. Otherwise, convert to an + appropriate floating extension type. + + In the future, as new dtypes are added that support ``pd.NA``, the results + of this method will change to support those new dtypes. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), + ... "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), + ... "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")), + ... "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")), + ... "e": pd.Series([10, np.nan, 20], dtype=np.dtype("float")), + ... "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")), + ... } + ... ) + + Start with a DataFrame with default dtypes. + + >>> df + a b c d e f + 0 1 x True h 10.0 NaN + 1 2 y False i NaN 100.5 + 2 3 z NaN NaN 20.0 200.0 + + >>> df.dtypes + a int32 + b object + c object + d object + e float64 + f float64 + dtype: object + + Convert the DataFrame to use best possible dtypes. + + >>> dfn = df.convert_dtypes() + >>> dfn + a b c d e f + 0 1 x True h 10 + 1 2 y False i 100.5 + 2 3 z 20 200.0 + + >>> dfn.dtypes + a Int32 + b string[python] + c boolean + d string[python] + e Int64 + f Float64 + dtype: object + + Start with a Series of strings and missing data represented by ``np.nan``. + + >>> s = pd.Series(["a", "b", np.nan]) + >>> s + 0 a + 1 b + 2 NaN + dtype: object + + Obtain a Series with dtype ``StringDtype``. + + >>> s.convert_dtypes() + 0 a + 1 b + 2 + dtype: string + """ + check_dtype_backend(dtype_backend) + new_mgr = self._mgr.convert_dtypes( # type: ignore[union-attr] + infer_objects=infer_objects, + convert_string=convert_string, + convert_integer=convert_integer, + convert_boolean=convert_boolean, + convert_floating=convert_floating, + dtype_backend=dtype_backend, + ) + res = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + return res.__finalize__(self, method="convert_dtypes") + + # ---------------------------------------------------------------------- + # Filling NA's + + def _deprecate_downcast(self, downcast, method_name: str): + # GH#40988 + if downcast is not lib.no_default: + warnings.warn( + f"The 'downcast' keyword in {method_name} is deprecated and " + "will be removed in a future version. Use " + "res.infer_objects(copy=False) to infer non-object dtype, or " + "pd.to_numeric with the 'downcast' keyword to downcast numeric " + "results.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + downcast = None + return downcast + + @final + def _pad_or_backfill( + self, + method: Literal["ffill", "bfill", "pad", "backfill"], + *, + axis: None | Axis = None, + inplace: bool_t = False, + limit: None | int = None, + limit_area: Literal["inside", "outside"] | None = None, + downcast: dict | None = None, + ): + if axis is None: + axis = 0 + axis = self._get_axis_number(axis) + method = clean_fill_method(method) + + if not self._mgr.is_single_block and axis == 1: + # e.g. test_align_fill_method + # TODO(3.0): once downcast is removed, we can do the .T + # in all axis=1 cases, and remove axis kward from mgr.pad_or_backfill. + if inplace: + raise NotImplementedError() + result = self.T._pad_or_backfill( + method=method, limit=limit, limit_area=limit_area + ).T + + return result + + new_mgr = self._mgr.pad_or_backfill( + method=method, + axis=self._get_block_manager_axis(axis), + limit=limit, + limit_area=limit_area, + inplace=inplace, + downcast=downcast, + ) + result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + if inplace: + return self._update_inplace(result) + else: + return result.__finalize__(self, method="fillna") + + @overload + def fillna( + self, + value: Hashable | Mapping | Series | DataFrame = ..., + *, + method: FillnaOptions | None = ..., + axis: Axis | None = ..., + inplace: Literal[False] = ..., + limit: int | None = ..., + downcast: dict | None = ..., + ) -> Self: + ... + + @overload + def fillna( + self, + value: Hashable | Mapping | Series | DataFrame = ..., + *, + method: FillnaOptions | None = ..., + axis: Axis | None = ..., + inplace: Literal[True], + limit: int | None = ..., + downcast: dict | None = ..., + ) -> None: + ... + + @overload + def fillna( + self, + value: Hashable | Mapping | Series | DataFrame = ..., + *, + method: FillnaOptions | None = ..., + axis: Axis | None = ..., + inplace: bool_t = ..., + limit: int | None = ..., + downcast: dict | None = ..., + ) -> Self | None: + ... + + @final + @doc( + klass=_shared_doc_kwargs["klass"], + axes_single_arg=_shared_doc_kwargs["axes_single_arg"], + ) + def fillna( + self, + value: Hashable | Mapping | Series | DataFrame | None = None, + *, + method: FillnaOptions | None = None, + axis: Axis | None = None, + inplace: bool_t = False, + limit: int | None = None, + downcast: dict | None | lib.NoDefault = lib.no_default, + ) -> Self | None: + """ + Fill NA/NaN values using the specified method. + + Parameters + ---------- + value : scalar, dict, Series, or DataFrame + Value to use to fill holes (e.g. 0), alternately a + dict/Series/DataFrame of values specifying which value to use for + each index (for a Series) or column (for a DataFrame). Values not + in the dict/Series/DataFrame will not be filled. This value cannot + be a list. + method : {{'backfill', 'bfill', 'ffill', None}}, default None + Method to use for filling holes in reindexed Series: + + * ffill: propagate last valid observation forward to next valid. + * backfill / bfill: use next valid observation to fill gap. + + .. deprecated:: 2.1.0 + Use ffill or bfill instead. + + axis : {axes_single_arg} + Axis along which to fill missing values. For `Series` + this parameter is unused and defaults to 0. + inplace : bool, default False + If True, fill in-place. Note: this will modify any + other views on this object (e.g., a no-copy slice for a column in a + DataFrame). + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. Must be greater than 0 if not None. + downcast : dict, default is None + A dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible). + + .. deprecated:: 2.2.0 + + Returns + ------- + {klass} or None + Object with missing values filled or None if ``inplace=True``. + + See Also + -------- + ffill : Fill values by propagating the last valid observation to next valid. + bfill : Fill values by using the next valid observation to fill the gap. + interpolate : Fill NaN values using interpolation. + reindex : Conform object to new index. + asfreq : Convert TimeSeries to specified frequency. + + Examples + -------- + >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0], + ... [3, 4, np.nan, 1], + ... [np.nan, np.nan, np.nan, np.nan], + ... [np.nan, 3, np.nan, 4]], + ... columns=list("ABCD")) + >>> df + A B C D + 0 NaN 2.0 NaN 0.0 + 1 3.0 4.0 NaN 1.0 + 2 NaN NaN NaN NaN + 3 NaN 3.0 NaN 4.0 + + Replace all NaN elements with 0s. + + >>> df.fillna(0) + A B C D + 0 0.0 2.0 0.0 0.0 + 1 3.0 4.0 0.0 1.0 + 2 0.0 0.0 0.0 0.0 + 3 0.0 3.0 0.0 4.0 + + Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1, + 2, and 3 respectively. + + >>> values = {{"A": 0, "B": 1, "C": 2, "D": 3}} + >>> df.fillna(value=values) + A B C D + 0 0.0 2.0 2.0 0.0 + 1 3.0 4.0 2.0 1.0 + 2 0.0 1.0 2.0 3.0 + 3 0.0 3.0 2.0 4.0 + + Only replace the first NaN element. + + >>> df.fillna(value=values, limit=1) + A B C D + 0 0.0 2.0 2.0 0.0 + 1 3.0 4.0 NaN 1.0 + 2 NaN 1.0 NaN 3.0 + 3 NaN 3.0 NaN 4.0 + + When filling using a DataFrame, replacement happens along + the same column names and same indices + + >>> df2 = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCE")) + >>> df.fillna(df2) + A B C D + 0 0.0 2.0 0.0 0.0 + 1 3.0 4.0 0.0 1.0 + 2 0.0 0.0 0.0 NaN + 3 0.0 3.0 0.0 4.0 + + Note that column D is not affected since it is not present in df2. + """ + inplace = validate_bool_kwarg(inplace, "inplace") + if inplace: + if not PYPY and using_copy_on_write(): + if sys.getrefcount(self) <= REF_COUNT: + warnings.warn( + _chained_assignment_method_msg, + ChainedAssignmentError, + stacklevel=2, + ) + elif ( + not PYPY + and not using_copy_on_write() + and self._is_view_after_cow_rules() + ): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and _check_cacher(self): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) + + value, method = validate_fillna_kwargs(value, method) + if method is not None: + warnings.warn( + f"{type(self).__name__}.fillna with 'method' is deprecated and " + "will raise in a future version. Use obj.ffill() or obj.bfill() " + "instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + was_no_default = downcast is lib.no_default + downcast = self._deprecate_downcast(downcast, "fillna") + + # set the default here, so functions examining the signaure + # can detect if something was set (e.g. in groupby) (GH9221) + if axis is None: + axis = 0 + axis = self._get_axis_number(axis) + + if value is None: + return self._pad_or_backfill( + # error: Argument 1 to "_pad_or_backfill" of "NDFrame" has + # incompatible type "Optional[Literal['backfill', 'bfill', 'ffill', + # 'pad']]"; expected "Literal['ffill', 'bfill', 'pad', 'backfill']" + method, # type: ignore[arg-type] + axis=axis, + limit=limit, + inplace=inplace, + # error: Argument "downcast" to "_fillna_with_method" of "NDFrame" + # has incompatible type "Union[Dict[Any, Any], None, + # Literal[_NoDefault.no_default]]"; expected + # "Optional[Dict[Any, Any]]" + downcast=downcast, # type: ignore[arg-type] + ) + else: + if self.ndim == 1: + if isinstance(value, (dict, ABCSeries)): + if not len(value): + # test_fillna_nonscalar + if inplace: + return None + return self.copy(deep=None) + from pandas import Series + + value = Series(value) + value = value.reindex(self.index, copy=False) + value = value._values + elif not is_list_like(value): + pass + else: + raise TypeError( + '"value" parameter must be a scalar, dict ' + "or Series, but you passed a " + f'"{type(value).__name__}"' + ) + + new_data = self._mgr.fillna( + value=value, limit=limit, inplace=inplace, downcast=downcast + ) + + elif isinstance(value, (dict, ABCSeries)): + if axis == 1: + raise NotImplementedError( + "Currently only can fill " + "with dict/Series column " + "by column" + ) + if using_copy_on_write(): + result = self.copy(deep=None) + else: + result = self if inplace else self.copy() + is_dict = isinstance(downcast, dict) + for k, v in value.items(): + if k not in result: + continue + + if was_no_default: + downcast_k = lib.no_default + else: + downcast_k = ( + # error: Incompatible types in assignment (expression + # has type "Union[Dict[Any, Any], None, + # Literal[_NoDefault.no_default], Any]", variable has + # type "_NoDefault") + downcast # type: ignore[assignment] + if not is_dict + # error: Item "None" of "Optional[Dict[Any, Any]]" has + # no attribute "get" + else downcast.get(k) # type: ignore[union-attr] + ) + + res_k = result[k].fillna(v, limit=limit, downcast=downcast_k) + + if not inplace: + result[k] = res_k + else: + # We can write into our existing column(s) iff dtype + # was preserved. + if isinstance(res_k, ABCSeries): + # i.e. 'k' only shows up once in self.columns + if res_k.dtype == result[k].dtype: + result.loc[:, k] = res_k + else: + # Different dtype -> no way to do inplace. + result[k] = res_k + else: + # see test_fillna_dict_inplace_nonunique_columns + locs = result.columns.get_loc(k) + if isinstance(locs, slice): + locs = np.arange(self.shape[1])[locs] + elif ( + isinstance(locs, np.ndarray) and locs.dtype.kind == "b" + ): + locs = locs.nonzero()[0] + elif not ( + isinstance(locs, np.ndarray) and locs.dtype.kind == "i" + ): + # Should never be reached, but let's cover our bases + raise NotImplementedError( + "Unexpected get_loc result, please report a bug at " + "https://github.com/pandas-dev/pandas" + ) + + for i, loc in enumerate(locs): + res_loc = res_k.iloc[:, i] + target = self.iloc[:, loc] + + if res_loc.dtype == target.dtype: + result.iloc[:, loc] = res_loc + else: + result.isetitem(loc, res_loc) + if inplace: + return self._update_inplace(result) + else: + return result + + elif not is_list_like(value): + if axis == 1: + result = self.T.fillna(value=value, limit=limit).T + new_data = result._mgr + else: + new_data = self._mgr.fillna( + value=value, limit=limit, inplace=inplace, downcast=downcast + ) + elif isinstance(value, ABCDataFrame) and self.ndim == 2: + new_data = self.where(self.notna(), value)._mgr + else: + raise ValueError(f"invalid fill value with a {type(value)}") + + result = self._constructor_from_mgr(new_data, axes=new_data.axes) + if inplace: + return self._update_inplace(result) + else: + return result.__finalize__(self, method="fillna") + + @overload + def ffill( + self, + *, + axis: None | Axis = ..., + inplace: Literal[False] = ..., + limit: None | int = ..., + limit_area: Literal["inside", "outside"] | None = ..., + downcast: dict | None | lib.NoDefault = ..., + ) -> Self: + ... + + @overload + def ffill( + self, + *, + axis: None | Axis = ..., + inplace: Literal[True], + limit: None | int = ..., + limit_area: Literal["inside", "outside"] | None = ..., + downcast: dict | None | lib.NoDefault = ..., + ) -> None: + ... + + @overload + def ffill( + self, + *, + axis: None | Axis = ..., + inplace: bool_t = ..., + limit: None | int = ..., + limit_area: Literal["inside", "outside"] | None = ..., + downcast: dict | None | lib.NoDefault = ..., + ) -> Self | None: + ... + + @final + @doc( + klass=_shared_doc_kwargs["klass"], + axes_single_arg=_shared_doc_kwargs["axes_single_arg"], + ) + def ffill( + self, + *, + axis: None | Axis = None, + inplace: bool_t = False, + limit: None | int = None, + limit_area: Literal["inside", "outside"] | None = None, + downcast: dict | None | lib.NoDefault = lib.no_default, + ) -> Self | None: + """ + Fill NA/NaN values by propagating the last valid observation to next valid. + + Parameters + ---------- + axis : {axes_single_arg} + Axis along which to fill missing values. For `Series` + this parameter is unused and defaults to 0. + inplace : bool, default False + If True, fill in-place. Note: this will modify any + other views on this object (e.g., a no-copy slice for a column in a + DataFrame). + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. Must be greater than 0 if not None. + limit_area : {{`None`, 'inside', 'outside'}}, default None + If limit is specified, consecutive NaNs will be filled with this + restriction. + + * ``None``: No fill restriction. + * 'inside': Only fill NaNs surrounded by valid values + (interpolate). + * 'outside': Only fill NaNs outside valid values (extrapolate). + + .. versionadded:: 2.2.0 + + downcast : dict, default is None + A dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible). + + .. deprecated:: 2.2.0 + + Returns + ------- + {klass} or None + Object with missing values filled or None if ``inplace=True``. + + Examples + -------- + >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0], + ... [3, 4, np.nan, 1], + ... [np.nan, np.nan, np.nan, np.nan], + ... [np.nan, 3, np.nan, 4]], + ... columns=list("ABCD")) + >>> df + A B C D + 0 NaN 2.0 NaN 0.0 + 1 3.0 4.0 NaN 1.0 + 2 NaN NaN NaN NaN + 3 NaN 3.0 NaN 4.0 + + >>> df.ffill() + A B C D + 0 NaN 2.0 NaN 0.0 + 1 3.0 4.0 NaN 1.0 + 2 3.0 4.0 NaN 1.0 + 3 3.0 3.0 NaN 4.0 + + >>> ser = pd.Series([1, np.nan, 2, 3]) + >>> ser.ffill() + 0 1.0 + 1 1.0 + 2 2.0 + 3 3.0 + dtype: float64 + """ + downcast = self._deprecate_downcast(downcast, "ffill") + inplace = validate_bool_kwarg(inplace, "inplace") + if inplace: + if not PYPY and using_copy_on_write(): + if sys.getrefcount(self) <= REF_COUNT: + warnings.warn( + _chained_assignment_method_msg, + ChainedAssignmentError, + stacklevel=2, + ) + elif ( + not PYPY + and not using_copy_on_write() + and self._is_view_after_cow_rules() + ): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and _check_cacher(self): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) + + return self._pad_or_backfill( + "ffill", + axis=axis, + inplace=inplace, + limit=limit, + limit_area=limit_area, + # error: Argument "downcast" to "_fillna_with_method" of "NDFrame" + # has incompatible type "Union[Dict[Any, Any], None, + # Literal[_NoDefault.no_default]]"; expected "Optional[Dict[Any, Any]]" + downcast=downcast, # type: ignore[arg-type] + ) + + @final + @doc(klass=_shared_doc_kwargs["klass"]) + def pad( + self, + *, + axis: None | Axis = None, + inplace: bool_t = False, + limit: None | int = None, + downcast: dict | None | lib.NoDefault = lib.no_default, + ) -> Self | None: + """ + Fill NA/NaN values by propagating the last valid observation to next valid. + + .. deprecated:: 2.0 + + {klass}.pad is deprecated. Use {klass}.ffill instead. + + Returns + ------- + {klass} or None + Object with missing values filled or None if ``inplace=True``. + + Examples + -------- + Please see examples for :meth:`DataFrame.ffill` or :meth:`Series.ffill`. + """ + warnings.warn( + "DataFrame.pad/Series.pad is deprecated. Use " + "DataFrame.ffill/Series.ffill instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self.ffill(axis=axis, inplace=inplace, limit=limit, downcast=downcast) + + @overload + def bfill( + self, + *, + axis: None | Axis = ..., + inplace: Literal[False] = ..., + limit: None | int = ..., + limit_area: Literal["inside", "outside"] | None = ..., + downcast: dict | None | lib.NoDefault = ..., + ) -> Self: + ... + + @overload + def bfill( + self, + *, + axis: None | Axis = ..., + inplace: Literal[True], + limit: None | int = ..., + downcast: dict | None | lib.NoDefault = ..., + ) -> None: + ... + + @overload + def bfill( + self, + *, + axis: None | Axis = ..., + inplace: bool_t = ..., + limit: None | int = ..., + limit_area: Literal["inside", "outside"] | None = ..., + downcast: dict | None | lib.NoDefault = ..., + ) -> Self | None: + ... + + @final + @doc( + klass=_shared_doc_kwargs["klass"], + axes_single_arg=_shared_doc_kwargs["axes_single_arg"], + ) + def bfill( + self, + *, + axis: None | Axis = None, + inplace: bool_t = False, + limit: None | int = None, + limit_area: Literal["inside", "outside"] | None = None, + downcast: dict | None | lib.NoDefault = lib.no_default, + ) -> Self | None: + """ + Fill NA/NaN values by using the next valid observation to fill the gap. + + Parameters + ---------- + axis : {axes_single_arg} + Axis along which to fill missing values. For `Series` + this parameter is unused and defaults to 0. + inplace : bool, default False + If True, fill in-place. Note: this will modify any + other views on this object (e.g., a no-copy slice for a column in a + DataFrame). + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. Must be greater than 0 if not None. + limit_area : {{`None`, 'inside', 'outside'}}, default None + If limit is specified, consecutive NaNs will be filled with this + restriction. + + * ``None``: No fill restriction. + * 'inside': Only fill NaNs surrounded by valid values + (interpolate). + * 'outside': Only fill NaNs outside valid values (extrapolate). + + .. versionadded:: 2.2.0 + + downcast : dict, default is None + A dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible). + + .. deprecated:: 2.2.0 + + Returns + ------- + {klass} or None + Object with missing values filled or None if ``inplace=True``. + + Examples + -------- + For Series: + + >>> s = pd.Series([1, None, None, 2]) + >>> s.bfill() + 0 1.0 + 1 2.0 + 2 2.0 + 3 2.0 + dtype: float64 + >>> s.bfill(limit=1) + 0 1.0 + 1 NaN + 2 2.0 + 3 2.0 + dtype: float64 + + With DataFrame: + + >>> df = pd.DataFrame({{'A': [1, None, None, 4], 'B': [None, 5, None, 7]}}) + >>> df + A B + 0 1.0 NaN + 1 NaN 5.0 + 2 NaN NaN + 3 4.0 7.0 + >>> df.bfill() + A B + 0 1.0 5.0 + 1 4.0 5.0 + 2 4.0 7.0 + 3 4.0 7.0 + >>> df.bfill(limit=1) + A B + 0 1.0 5.0 + 1 NaN 5.0 + 2 4.0 7.0 + 3 4.0 7.0 + """ + downcast = self._deprecate_downcast(downcast, "bfill") + inplace = validate_bool_kwarg(inplace, "inplace") + if inplace: + if not PYPY and using_copy_on_write(): + if sys.getrefcount(self) <= REF_COUNT: + warnings.warn( + _chained_assignment_method_msg, + ChainedAssignmentError, + stacklevel=2, + ) + elif ( + not PYPY + and not using_copy_on_write() + and self._is_view_after_cow_rules() + ): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and _check_cacher(self): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) + + return self._pad_or_backfill( + "bfill", + axis=axis, + inplace=inplace, + limit=limit, + limit_area=limit_area, + # error: Argument "downcast" to "_fillna_with_method" of "NDFrame" + # has incompatible type "Union[Dict[Any, Any], None, + # Literal[_NoDefault.no_default]]"; expected "Optional[Dict[Any, Any]]" + downcast=downcast, # type: ignore[arg-type] + ) + + @final + @doc(klass=_shared_doc_kwargs["klass"]) + def backfill( + self, + *, + axis: None | Axis = None, + inplace: bool_t = False, + limit: None | int = None, + downcast: dict | None | lib.NoDefault = lib.no_default, + ) -> Self | None: + """ + Fill NA/NaN values by using the next valid observation to fill the gap. + + .. deprecated:: 2.0 + + {klass}.backfill is deprecated. Use {klass}.bfill instead. + + Returns + ------- + {klass} or None + Object with missing values filled or None if ``inplace=True``. + + Examples + -------- + Please see examples for :meth:`DataFrame.bfill` or :meth:`Series.bfill`. + """ + warnings.warn( + "DataFrame.backfill/Series.backfill is deprecated. Use " + "DataFrame.bfill/Series.bfill instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self.bfill(axis=axis, inplace=inplace, limit=limit, downcast=downcast) + + @overload + def replace( + self, + to_replace=..., + value=..., + *, + inplace: Literal[False] = ..., + limit: int | None = ..., + regex: bool_t = ..., + method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ..., + ) -> Self: + ... + + @overload + def replace( + self, + to_replace=..., + value=..., + *, + inplace: Literal[True], + limit: int | None = ..., + regex: bool_t = ..., + method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ..., + ) -> None: + ... + + @overload + def replace( + self, + to_replace=..., + value=..., + *, + inplace: bool_t = ..., + limit: int | None = ..., + regex: bool_t = ..., + method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ..., + ) -> Self | None: + ... + + @final + @doc( + _shared_docs["replace"], + klass=_shared_doc_kwargs["klass"], + inplace=_shared_doc_kwargs["inplace"], + ) + def replace( + self, + to_replace=None, + value=lib.no_default, + *, + inplace: bool_t = False, + limit: int | None = None, + regex: bool_t = False, + method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = lib.no_default, + ) -> Self | None: + if method is not lib.no_default: + warnings.warn( + # GH#33302 + f"The 'method' keyword in {type(self).__name__}.replace is " + "deprecated and will be removed in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + elif limit is not None: + warnings.warn( + # GH#33302 + f"The 'limit' keyword in {type(self).__name__}.replace is " + "deprecated and will be removed in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if ( + value is lib.no_default + and method is lib.no_default + and not is_dict_like(to_replace) + and regex is False + ): + # case that goes through _replace_single and defaults to method="pad" + warnings.warn( + # GH#33302 + f"{type(self).__name__}.replace without 'value' and with " + "non-dict-like 'to_replace' is deprecated " + "and will raise in a future version. " + "Explicitly specify the new values instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if not ( + is_scalar(to_replace) + or is_re_compilable(to_replace) + or is_list_like(to_replace) + ): + raise TypeError( + "Expecting 'to_replace' to be either a scalar, array-like, " + "dict or None, got invalid type " + f"{repr(type(to_replace).__name__)}" + ) + + inplace = validate_bool_kwarg(inplace, "inplace") + if inplace: + if not PYPY and using_copy_on_write(): + if sys.getrefcount(self) <= REF_COUNT: + warnings.warn( + _chained_assignment_method_msg, + ChainedAssignmentError, + stacklevel=2, + ) + elif ( + not PYPY + and not using_copy_on_write() + and self._is_view_after_cow_rules() + ): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and _check_cacher(self): + # in non-CoW mode, chained Series access will populate the + # `_item_cache` which results in an increased ref count not below + # the threshold, while we still need to warn. We detect this case + # of a Series derived from a DataFrame through the presence of + # checking the `_cacher` + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) + + if not is_bool(regex) and to_replace is not None: + raise ValueError("'to_replace' must be 'None' if 'regex' is not a bool") + + if value is lib.no_default or method is not lib.no_default: + # GH#36984 if the user explicitly passes value=None we want to + # respect that. We have the corner case where the user explicitly + # passes value=None *and* a method, which we interpret as meaning + # they want the (documented) default behavior. + if method is lib.no_default: + # TODO: get this to show up as the default in the docs? + method = "pad" + + # passing a single value that is scalar like + # when value is None (GH5319), for compat + if not is_dict_like(to_replace) and not is_dict_like(regex): + to_replace = [to_replace] + + if isinstance(to_replace, (tuple, list)): + # TODO: Consider copy-on-write for non-replaced columns's here + if isinstance(self, ABCDataFrame): + from pandas import Series + + result = self.apply( + Series._replace_single, + args=(to_replace, method, inplace, limit), + ) + if inplace: + return None + return result + return self._replace_single(to_replace, method, inplace, limit) + + if not is_dict_like(to_replace): + if not is_dict_like(regex): + raise TypeError( + 'If "to_replace" and "value" are both None ' + 'and "to_replace" is not a list, then ' + "regex must be a mapping" + ) + to_replace = regex + regex = True + + items = list(to_replace.items()) + if items: + keys, values = zip(*items) + else: + # error: Incompatible types in assignment (expression has type + # "list[Never]", variable has type "tuple[Any, ...]") + keys, values = ([], []) # type: ignore[assignment] + + are_mappings = [is_dict_like(v) for v in values] + + if any(are_mappings): + if not all(are_mappings): + raise TypeError( + "If a nested mapping is passed, all values " + "of the top level mapping must be mappings" + ) + # passed a nested dict/Series + to_rep_dict = {} + value_dict = {} + + for k, v in items: + # error: Incompatible types in assignment (expression has type + # "list[Never]", variable has type "tuple[Any, ...]") + keys, values = list(zip(*v.items())) or ( # type: ignore[assignment] + [], + [], + ) + + to_rep_dict[k] = list(keys) + value_dict[k] = list(values) + + to_replace, value = to_rep_dict, value_dict + else: + to_replace, value = keys, values + + return self.replace( + to_replace, value, inplace=inplace, limit=limit, regex=regex + ) + else: + # need a non-zero len on all axes + if not self.size: + if inplace: + return None + return self.copy(deep=None) + + if is_dict_like(to_replace): + if is_dict_like(value): # {'A' : NA} -> {'A' : 0} + # Note: Checking below for `in foo.keys()` instead of + # `in foo` is needed for when we have a Series and not dict + mapping = { + col: (to_replace[col], value[col]) + for col in to_replace.keys() + if col in value.keys() and col in self + } + return self._replace_columnwise(mapping, inplace, regex) + + # {'A': NA} -> 0 + elif not is_list_like(value): + # Operate column-wise + if self.ndim == 1: + raise ValueError( + "Series.replace cannot use dict-like to_replace " + "and non-None value" + ) + mapping = { + col: (to_rep, value) for col, to_rep in to_replace.items() + } + return self._replace_columnwise(mapping, inplace, regex) + else: + raise TypeError("value argument must be scalar, dict, or Series") + + elif is_list_like(to_replace): + if not is_list_like(value): + # e.g. to_replace = [NA, ''] and value is 0, + # so we replace NA with 0 and then replace '' with 0 + value = [value] * len(to_replace) + + # e.g. we have to_replace = [NA, ''] and value = [0, 'missing'] + if len(to_replace) != len(value): + raise ValueError( + f"Replacement lists must match in length. " + f"Expecting {len(to_replace)} got {len(value)} " + ) + new_data = self._mgr.replace_list( + src_list=to_replace, + dest_list=value, + inplace=inplace, + regex=regex, + ) + + elif to_replace is None: + if not ( + is_re_compilable(regex) + or is_list_like(regex) + or is_dict_like(regex) + ): + raise TypeError( + f"'regex' must be a string or a compiled regular expression " + f"or a list or dict of strings or regular expressions, " + f"you passed a {repr(type(regex).__name__)}" + ) + return self.replace( + regex, value, inplace=inplace, limit=limit, regex=True + ) + else: + # dest iterable dict-like + if is_dict_like(value): # NA -> {'A' : 0, 'B' : -1} + # Operate column-wise + if self.ndim == 1: + raise ValueError( + "Series.replace cannot use dict-value and " + "non-None to_replace" + ) + mapping = {col: (to_replace, val) for col, val in value.items()} + return self._replace_columnwise(mapping, inplace, regex) + + elif not is_list_like(value): # NA -> 0 + regex = should_use_regex(regex, to_replace) + if regex: + new_data = self._mgr.replace_regex( + to_replace=to_replace, + value=value, + inplace=inplace, + ) + else: + new_data = self._mgr.replace( + to_replace=to_replace, value=value, inplace=inplace + ) + else: + raise TypeError( + f'Invalid "to_replace" type: {repr(type(to_replace).__name__)}' + ) + + result = self._constructor_from_mgr(new_data, axes=new_data.axes) + if inplace: + return self._update_inplace(result) + else: + return result.__finalize__(self, method="replace") + + @overload + def interpolate( + self, + method: InterpolateOptions = ..., + *, + axis: Axis = ..., + limit: int | None = ..., + inplace: Literal[False] = ..., + limit_direction: Literal["forward", "backward", "both"] | None = ..., + limit_area: Literal["inside", "outside"] | None = ..., + downcast: Literal["infer"] | None | lib.NoDefault = ..., + **kwargs, + ) -> Self: + ... + + @overload + def interpolate( + self, + method: InterpolateOptions = ..., + *, + axis: Axis = ..., + limit: int | None = ..., + inplace: Literal[True], + limit_direction: Literal["forward", "backward", "both"] | None = ..., + limit_area: Literal["inside", "outside"] | None = ..., + downcast: Literal["infer"] | None | lib.NoDefault = ..., + **kwargs, + ) -> None: + ... + + @overload + def interpolate( + self, + method: InterpolateOptions = ..., + *, + axis: Axis = ..., + limit: int | None = ..., + inplace: bool_t = ..., + limit_direction: Literal["forward", "backward", "both"] | None = ..., + limit_area: Literal["inside", "outside"] | None = ..., + downcast: Literal["infer"] | None | lib.NoDefault = ..., + **kwargs, + ) -> Self | None: + ... + + @final + def interpolate( + self, + method: InterpolateOptions = "linear", + *, + axis: Axis = 0, + limit: int | None = None, + inplace: bool_t = False, + limit_direction: Literal["forward", "backward", "both"] | None = None, + limit_area: Literal["inside", "outside"] | None = None, + downcast: Literal["infer"] | None | lib.NoDefault = lib.no_default, + **kwargs, + ) -> Self | None: + """ + Fill NaN values using an interpolation method. + + Please note that only ``method='linear'`` is supported for + DataFrame/Series with a MultiIndex. + + Parameters + ---------- + method : str, default 'linear' + Interpolation technique to use. One of: + + * 'linear': Ignore the index and treat the values as equally + spaced. This is the only method supported on MultiIndexes. + * 'time': Works on daily and higher resolution data to interpolate + given length of interval. + * 'index', 'values': use the actual numerical values of the index. + * 'pad': Fill in NaNs using existing values. + * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', + 'barycentric', 'polynomial': Passed to + `scipy.interpolate.interp1d`, whereas 'spline' is passed to + `scipy.interpolate.UnivariateSpline`. These methods use the numerical + values of the index. Both 'polynomial' and 'spline' require that + you also specify an `order` (int), e.g. + ``df.interpolate(method='polynomial', order=5)``. Note that, + `slinear` method in Pandas refers to the Scipy first order `spline` + instead of Pandas first order `spline`. + * 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima', + 'cubicspline': Wrappers around the SciPy interpolation methods of + similar names. See `Notes`. + * 'from_derivatives': Refers to + `scipy.interpolate.BPoly.from_derivatives`. + + axis : {{0 or 'index', 1 or 'columns', None}}, default None + Axis to interpolate along. For `Series` this parameter is unused + and defaults to 0. + limit : int, optional + Maximum number of consecutive NaNs to fill. Must be greater than + 0. + inplace : bool, default False + Update the data in place if possible. + limit_direction : {{'forward', 'backward', 'both'}}, Optional + Consecutive NaNs will be filled in this direction. + + If limit is specified: + * If 'method' is 'pad' or 'ffill', 'limit_direction' must be 'forward'. + * If 'method' is 'backfill' or 'bfill', 'limit_direction' must be + 'backwards'. + + If 'limit' is not specified: + * If 'method' is 'backfill' or 'bfill', the default is 'backward' + * else the default is 'forward' + + raises ValueError if `limit_direction` is 'forward' or 'both' and + method is 'backfill' or 'bfill'. + raises ValueError if `limit_direction` is 'backward' or 'both' and + method is 'pad' or 'ffill'. + + limit_area : {{`None`, 'inside', 'outside'}}, default None + If limit is specified, consecutive NaNs will be filled with this + restriction. + + * ``None``: No fill restriction. + * 'inside': Only fill NaNs surrounded by valid values + (interpolate). + * 'outside': Only fill NaNs outside valid values (extrapolate). + + downcast : optional, 'infer' or None, defaults to None + Downcast dtypes if possible. + + .. deprecated:: 2.1.0 + + ``**kwargs`` : optional + Keyword arguments to pass on to the interpolating function. + + Returns + ------- + Series or DataFrame or None + Returns the same object type as the caller, interpolated at + some or all ``NaN`` values or None if ``inplace=True``. + + See Also + -------- + fillna : Fill missing values using different methods. + scipy.interpolate.Akima1DInterpolator : Piecewise cubic polynomials + (Akima interpolator). + scipy.interpolate.BPoly.from_derivatives : Piecewise polynomial in the + Bernstein basis. + scipy.interpolate.interp1d : Interpolate a 1-D function. + scipy.interpolate.KroghInterpolator : Interpolate polynomial (Krogh + interpolator). + scipy.interpolate.PchipInterpolator : PCHIP 1-d monotonic cubic + interpolation. + scipy.interpolate.CubicSpline : Cubic spline data interpolator. + + Notes + ----- + The 'krogh', 'piecewise_polynomial', 'spline', 'pchip' and 'akima' + methods are wrappers around the respective SciPy implementations of + similar names. These use the actual numerical values of the index. + For more information on their behavior, see the + `SciPy documentation + `__. + + Examples + -------- + Filling in ``NaN`` in a :class:`~pandas.Series` via linear + interpolation. + + >>> s = pd.Series([0, 1, np.nan, 3]) + >>> s + 0 0.0 + 1 1.0 + 2 NaN + 3 3.0 + dtype: float64 + >>> s.interpolate() + 0 0.0 + 1 1.0 + 2 2.0 + 3 3.0 + dtype: float64 + + Filling in ``NaN`` in a Series via polynomial interpolation or splines: + Both 'polynomial' and 'spline' methods require that you also specify + an ``order`` (int). + + >>> s = pd.Series([0, 2, np.nan, 8]) + >>> s.interpolate(method='polynomial', order=2) + 0 0.000000 + 1 2.000000 + 2 4.666667 + 3 8.000000 + dtype: float64 + + Fill the DataFrame forward (that is, going down) along each column + using linear interpolation. + + Note how the last entry in column 'a' is interpolated differently, + because there is no entry after it to use for interpolation. + Note how the first entry in column 'b' remains ``NaN``, because there + is no entry before it to use for interpolation. + + >>> df = pd.DataFrame([(0.0, np.nan, -1.0, 1.0), + ... (np.nan, 2.0, np.nan, np.nan), + ... (2.0, 3.0, np.nan, 9.0), + ... (np.nan, 4.0, -4.0, 16.0)], + ... columns=list('abcd')) + >>> df + a b c d + 0 0.0 NaN -1.0 1.0 + 1 NaN 2.0 NaN NaN + 2 2.0 3.0 NaN 9.0 + 3 NaN 4.0 -4.0 16.0 + >>> df.interpolate(method='linear', limit_direction='forward', axis=0) + a b c d + 0 0.0 NaN -1.0 1.0 + 1 1.0 2.0 -2.0 5.0 + 2 2.0 3.0 -3.0 9.0 + 3 2.0 4.0 -4.0 16.0 + + Using polynomial interpolation. + + >>> df['d'].interpolate(method='polynomial', order=2) + 0 1.0 + 1 4.0 + 2 9.0 + 3 16.0 + Name: d, dtype: float64 + """ + if downcast is not lib.no_default: + # GH#40988 + warnings.warn( + f"The 'downcast' keyword in {type(self).__name__}.interpolate " + "is deprecated and will be removed in a future version. " + "Call result.infer_objects(copy=False) on the result instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + downcast = None + if downcast is not None and downcast != "infer": + raise ValueError("downcast must be either None or 'infer'") + + inplace = validate_bool_kwarg(inplace, "inplace") + + if inplace: + if not PYPY and using_copy_on_write(): + if sys.getrefcount(self) <= REF_COUNT: + warnings.warn( + _chained_assignment_method_msg, + ChainedAssignmentError, + stacklevel=2, + ) + elif ( + not PYPY + and not using_copy_on_write() + and self._is_view_after_cow_rules() + ): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and _check_cacher(self): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) + + axis = self._get_axis_number(axis) + + if self.empty: + if inplace: + return None + return self.copy() + + if not isinstance(method, str): + raise ValueError("'method' should be a string, not None.") + + fillna_methods = ["ffill", "bfill", "pad", "backfill"] + if method.lower() in fillna_methods: + # GH#53581 + warnings.warn( + f"{type(self).__name__}.interpolate with method={method} is " + "deprecated and will raise in a future version. " + "Use obj.ffill() or obj.bfill() instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + obj, should_transpose = self, False + else: + obj, should_transpose = (self.T, True) if axis == 1 else (self, False) + if np.any(obj.dtypes == object): + # GH#53631 + if not (obj.ndim == 2 and np.all(obj.dtypes == object)): + # don't warn in cases that already raise + warnings.warn( + f"{type(self).__name__}.interpolate with object dtype is " + "deprecated and will raise in a future version. Call " + "obj.infer_objects(copy=False) before interpolating instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if method in fillna_methods and "fill_value" in kwargs: + raise ValueError( + "'fill_value' is not a valid keyword for " + f"{type(self).__name__}.interpolate with method from " + f"{fillna_methods}" + ) + + if isinstance(obj.index, MultiIndex) and method != "linear": + raise ValueError( + "Only `method=linear` interpolation is supported on MultiIndexes." + ) + + limit_direction = missing.infer_limit_direction(limit_direction, method) + + if obj.ndim == 2 and np.all(obj.dtypes == object): + raise TypeError( + "Cannot interpolate with all object-dtype columns " + "in the DataFrame. Try setting at least one " + "column to a numeric dtype." + ) + + if method.lower() in fillna_methods: + # TODO(3.0): remove this case + # TODO: warn/raise on limit_direction or kwargs which are ignored? + # as of 2023-06-26 no tests get here with either + if not self._mgr.is_single_block and axis == 1: + # GH#53898 + if inplace: + raise NotImplementedError() + obj, axis, should_transpose = self.T, 1 - axis, True + + new_data = obj._mgr.pad_or_backfill( + method=method, + axis=self._get_block_manager_axis(axis), + limit=limit, + limit_area=limit_area, + inplace=inplace, + downcast=downcast, + ) + else: + index = missing.get_interp_index(method, obj.index) + new_data = obj._mgr.interpolate( + method=method, + index=index, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + inplace=inplace, + downcast=downcast, + **kwargs, + ) + + result = self._constructor_from_mgr(new_data, axes=new_data.axes) + if should_transpose: + result = result.T + if inplace: + return self._update_inplace(result) + else: + return result.__finalize__(self, method="interpolate") + + # ---------------------------------------------------------------------- + # Timeseries methods Methods + + @final + def asof(self, where, subset=None): + """ + Return the last row(s) without any NaNs before `where`. + + The last row (for each element in `where`, if list) without any + NaN is taken. + In case of a :class:`~pandas.DataFrame`, the last row without NaN + considering only the subset of columns (if not `None`) + + If there is no good value, NaN is returned for a Series or + a Series of NaN values for a DataFrame + + Parameters + ---------- + where : date or array-like of dates + Date(s) before which the last row(s) are returned. + subset : str or array-like of str, default `None` + For DataFrame, if not `None`, only use these columns to + check for NaNs. + + Returns + ------- + scalar, Series, or DataFrame + + The return can be: + + * scalar : when `self` is a Series and `where` is a scalar + * Series: when `self` is a Series and `where` is an array-like, + or when `self` is a DataFrame and `where` is a scalar + * DataFrame : when `self` is a DataFrame and `where` is an + array-like + + See Also + -------- + merge_asof : Perform an asof merge. Similar to left join. + + Notes + ----- + Dates are assumed to be sorted. Raises if this is not the case. + + Examples + -------- + A Series and a scalar `where`. + + >>> s = pd.Series([1, 2, np.nan, 4], index=[10, 20, 30, 40]) + >>> s + 10 1.0 + 20 2.0 + 30 NaN + 40 4.0 + dtype: float64 + + >>> s.asof(20) + 2.0 + + For a sequence `where`, a Series is returned. The first value is + NaN, because the first element of `where` is before the first + index value. + + >>> s.asof([5, 20]) + 5 NaN + 20 2.0 + dtype: float64 + + Missing values are not considered. The following is ``2.0``, not + NaN, even though NaN is at the index location for ``30``. + + >>> s.asof(30) + 2.0 + + Take all columns into consideration + + >>> df = pd.DataFrame({'a': [10., 20., 30., 40., 50.], + ... 'b': [None, None, None, None, 500]}, + ... index=pd.DatetimeIndex(['2018-02-27 09:01:00', + ... '2018-02-27 09:02:00', + ... '2018-02-27 09:03:00', + ... '2018-02-27 09:04:00', + ... '2018-02-27 09:05:00'])) + >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30', + ... '2018-02-27 09:04:30'])) + a b + 2018-02-27 09:03:30 NaN NaN + 2018-02-27 09:04:30 NaN NaN + + Take a single column into consideration + + >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30', + ... '2018-02-27 09:04:30']), + ... subset=['a']) + a b + 2018-02-27 09:03:30 30.0 NaN + 2018-02-27 09:04:30 40.0 NaN + """ + if isinstance(where, str): + where = Timestamp(where) + + if not self.index.is_monotonic_increasing: + raise ValueError("asof requires a sorted index") + + is_series = isinstance(self, ABCSeries) + if is_series: + if subset is not None: + raise ValueError("subset is not valid for Series") + else: + if subset is None: + subset = self.columns + if not is_list_like(subset): + subset = [subset] + + is_list = is_list_like(where) + if not is_list: + start = self.index[0] + if isinstance(self.index, PeriodIndex): + where = Period(where, freq=self.index.freq) + + if where < start: + if not is_series: + return self._constructor_sliced( + index=self.columns, name=where, dtype=np.float64 + ) + return np.nan + + # It's always much faster to use a *while* loop here for + # Series than pre-computing all the NAs. However a + # *while* loop is extremely expensive for DataFrame + # so we later pre-compute all the NAs and use the same + # code path whether *where* is a scalar or list. + # See PR: https://github.com/pandas-dev/pandas/pull/14476 + if is_series: + loc = self.index.searchsorted(where, side="right") + if loc > 0: + loc -= 1 + + values = self._values + while loc > 0 and isna(values[loc]): + loc -= 1 + return values[loc] + + if not isinstance(where, Index): + where = Index(where) if is_list else Index([where]) + + nulls = self.isna() if is_series else self[subset].isna().any(axis=1) + if nulls.all(): + if is_series: + self = cast("Series", self) + return self._constructor(np.nan, index=where, name=self.name) + elif is_list: + self = cast("DataFrame", self) + return self._constructor(np.nan, index=where, columns=self.columns) + else: + self = cast("DataFrame", self) + return self._constructor_sliced( + np.nan, index=self.columns, name=where[0] + ) + + locs = self.index.asof_locs(where, ~(nulls._values)) + + # mask the missing + mask = locs == -1 + data = self.take(locs) + data.index = where + if mask.any(): + # GH#16063 only do this setting when necessary, otherwise + # we'd cast e.g. bools to floats + data.loc[mask] = np.nan + return data if is_list else data.iloc[-1] + + # ---------------------------------------------------------------------- + # Action Methods + + @doc(klass=_shared_doc_kwargs["klass"]) + def isna(self) -> Self: + """ + Detect missing values. + + Return a boolean same-sized object indicating if the values are NA. + NA values, such as None or :attr:`numpy.NaN`, gets mapped to True + values. + Everything else gets mapped to False values. Characters such as empty + strings ``''`` or :attr:`numpy.inf` are not considered NA values + (unless you set ``pandas.options.mode.use_inf_as_na = True``). + + Returns + ------- + {klass} + Mask of bool values for each element in {klass} that + indicates whether an element is an NA value. + + See Also + -------- + {klass}.isnull : Alias of isna. + {klass}.notna : Boolean inverse of isna. + {klass}.dropna : Omit axes labels with missing values. + isna : Top-level isna. + + Examples + -------- + Show which entries in a DataFrame are NA. + + >>> df = pd.DataFrame(dict(age=[5, 6, np.nan], + ... born=[pd.NaT, pd.Timestamp('1939-05-27'), + ... pd.Timestamp('1940-04-25')], + ... name=['Alfred', 'Batman', ''], + ... toy=[None, 'Batmobile', 'Joker'])) + >>> df + age born name toy + 0 5.0 NaT Alfred None + 1 6.0 1939-05-27 Batman Batmobile + 2 NaN 1940-04-25 Joker + + >>> df.isna() + age born name toy + 0 False True False True + 1 False False False False + 2 True False False False + + Show which entries in a Series are NA. + + >>> ser = pd.Series([5, 6, np.nan]) + >>> ser + 0 5.0 + 1 6.0 + 2 NaN + dtype: float64 + + >>> ser.isna() + 0 False + 1 False + 2 True + dtype: bool + """ + return isna(self).__finalize__(self, method="isna") + + @doc(isna, klass=_shared_doc_kwargs["klass"]) + def isnull(self) -> Self: + return isna(self).__finalize__(self, method="isnull") + + @doc(klass=_shared_doc_kwargs["klass"]) + def notna(self) -> Self: + """ + Detect existing (non-missing) values. + + Return a boolean same-sized object indicating if the values are not NA. + Non-missing values get mapped to True. Characters such as empty + strings ``''`` or :attr:`numpy.inf` are not considered NA values + (unless you set ``pandas.options.mode.use_inf_as_na = True``). + NA values, such as None or :attr:`numpy.NaN`, get mapped to False + values. + + Returns + ------- + {klass} + Mask of bool values for each element in {klass} that + indicates whether an element is not an NA value. + + See Also + -------- + {klass}.notnull : Alias of notna. + {klass}.isna : Boolean inverse of notna. + {klass}.dropna : Omit axes labels with missing values. + notna : Top-level notna. + + Examples + -------- + Show which entries in a DataFrame are not NA. + + >>> df = pd.DataFrame(dict(age=[5, 6, np.nan], + ... born=[pd.NaT, pd.Timestamp('1939-05-27'), + ... pd.Timestamp('1940-04-25')], + ... name=['Alfred', 'Batman', ''], + ... toy=[None, 'Batmobile', 'Joker'])) + >>> df + age born name toy + 0 5.0 NaT Alfred None + 1 6.0 1939-05-27 Batman Batmobile + 2 NaN 1940-04-25 Joker + + >>> df.notna() + age born name toy + 0 True False True False + 1 True True True True + 2 False True True True + + Show which entries in a Series are not NA. + + >>> ser = pd.Series([5, 6, np.nan]) + >>> ser + 0 5.0 + 1 6.0 + 2 NaN + dtype: float64 + + >>> ser.notna() + 0 True + 1 True + 2 False + dtype: bool + """ + return notna(self).__finalize__(self, method="notna") + + @doc(notna, klass=_shared_doc_kwargs["klass"]) + def notnull(self) -> Self: + return notna(self).__finalize__(self, method="notnull") + + @final + def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): + if (lower is not None and np.any(isna(lower))) or ( + upper is not None and np.any(isna(upper)) + ): + raise ValueError("Cannot use an NA value as a clip threshold") + + result = self + mask = self.isna() + + if lower is not None: + cond = mask | (self >= lower) + result = result.where( + cond, lower, inplace=inplace + ) # type: ignore[assignment] + if upper is not None: + cond = mask | (self <= upper) + result = self if inplace else result + result = result.where( + cond, upper, inplace=inplace + ) # type: ignore[assignment] + + return result + + @final + def _clip_with_one_bound(self, threshold, method, axis, inplace): + if axis is not None: + axis = self._get_axis_number(axis) + + # method is self.le for upper bound and self.ge for lower bound + if is_scalar(threshold) and is_number(threshold): + if method.__name__ == "le": + return self._clip_with_scalar(None, threshold, inplace=inplace) + return self._clip_with_scalar(threshold, None, inplace=inplace) + + # GH #15390 + # In order for where method to work, the threshold must + # be transformed to NDFrame from other array like structure. + if (not isinstance(threshold, ABCSeries)) and is_list_like(threshold): + if isinstance(self, ABCSeries): + threshold = self._constructor(threshold, index=self.index) + else: + threshold = self._align_for_op(threshold, axis, flex=None)[1] + + # GH 40420 + # Treat missing thresholds as no bounds, not clipping the values + if is_list_like(threshold): + fill_value = np.inf if method.__name__ == "le" else -np.inf + threshold_inf = threshold.fillna(fill_value) + else: + threshold_inf = threshold + + subset = method(threshold_inf, axis=axis) | isna(self) + + # GH 40420 + return self.where(subset, threshold, axis=axis, inplace=inplace) + + @overload + def clip( + self, + lower=..., + upper=..., + *, + axis: Axis | None = ..., + inplace: Literal[False] = ..., + **kwargs, + ) -> Self: + ... + + @overload + def clip( + self, + lower=..., + upper=..., + *, + axis: Axis | None = ..., + inplace: Literal[True], + **kwargs, + ) -> None: + ... + + @overload + def clip( + self, + lower=..., + upper=..., + *, + axis: Axis | None = ..., + inplace: bool_t = ..., + **kwargs, + ) -> Self | None: + ... + + @final + def clip( + self, + lower=None, + upper=None, + *, + axis: Axis | None = None, + inplace: bool_t = False, + **kwargs, + ) -> Self | None: + """ + Trim values at input threshold(s). + + Assigns values outside boundary to boundary values. Thresholds + can be singular values or array like, and in the latter case + the clipping is performed element-wise in the specified axis. + + Parameters + ---------- + lower : float or array-like, default None + Minimum threshold value. All values below this + threshold will be set to it. A missing + threshold (e.g `NA`) will not clip the value. + upper : float or array-like, default None + Maximum threshold value. All values above this + threshold will be set to it. A missing + threshold (e.g `NA`) will not clip the value. + axis : {{0 or 'index', 1 or 'columns', None}}, default None + Align object with lower and upper along the given axis. + For `Series` this parameter is unused and defaults to `None`. + inplace : bool, default False + Whether to perform the operation in place on the data. + *args, **kwargs + Additional keywords have no effect but might be accepted + for compatibility with numpy. + + Returns + ------- + Series or DataFrame or None + Same type as calling object with the values outside the + clip boundaries replaced or None if ``inplace=True``. + + See Also + -------- + Series.clip : Trim values at input threshold in series. + DataFrame.clip : Trim values at input threshold in dataframe. + numpy.clip : Clip (limit) the values in an array. + + Examples + -------- + >>> data = {'col_0': [9, -3, 0, -1, 5], 'col_1': [-2, -7, 6, 8, -5]} + >>> df = pd.DataFrame(data) + >>> df + col_0 col_1 + 0 9 -2 + 1 -3 -7 + 2 0 6 + 3 -1 8 + 4 5 -5 + + Clips per column using lower and upper thresholds: + + >>> df.clip(-4, 6) + col_0 col_1 + 0 6 -2 + 1 -3 -4 + 2 0 6 + 3 -1 6 + 4 5 -4 + + Clips using specific lower and upper thresholds per column: + + >>> df.clip([-2, -1], [4, 5]) + col_0 col_1 + 0 4 -1 + 1 -2 -1 + 2 0 5 + 3 -1 5 + 4 4 -1 + + Clips using specific lower and upper thresholds per column element: + + >>> t = pd.Series([2, -4, -1, 6, 3]) + >>> t + 0 2 + 1 -4 + 2 -1 + 3 6 + 4 3 + dtype: int64 + + >>> df.clip(t, t + 4, axis=0) + col_0 col_1 + 0 6 2 + 1 -3 -4 + 2 0 3 + 3 6 8 + 4 5 3 + + Clips using specific lower threshold per column element, with missing values: + + >>> t = pd.Series([2, -4, np.nan, 6, 3]) + >>> t + 0 2.0 + 1 -4.0 + 2 NaN + 3 6.0 + 4 3.0 + dtype: float64 + + >>> df.clip(t, axis=0) + col_0 col_1 + 0 9 2 + 1 -3 -4 + 2 0 6 + 3 6 8 + 4 5 3 + """ + inplace = validate_bool_kwarg(inplace, "inplace") + + if inplace: + if not PYPY and using_copy_on_write(): + if sys.getrefcount(self) <= REF_COUNT: + warnings.warn( + _chained_assignment_method_msg, + ChainedAssignmentError, + stacklevel=2, + ) + elif ( + not PYPY + and not using_copy_on_write() + and self._is_view_after_cow_rules() + ): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and hasattr(self, "_cacher"): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) + + axis = nv.validate_clip_with_axis(axis, (), kwargs) + if axis is not None: + axis = self._get_axis_number(axis) + + # GH 17276 + # numpy doesn't like NaN as a clip value + # so ignore + # GH 19992 + # numpy doesn't drop a list-like bound containing NaN + isna_lower = isna(lower) + if not is_list_like(lower): + if np.any(isna_lower): + lower = None + elif np.all(isna_lower): + lower = None + isna_upper = isna(upper) + if not is_list_like(upper): + if np.any(isna_upper): + upper = None + elif np.all(isna_upper): + upper = None + + # GH 2747 (arguments were reversed) + if ( + lower is not None + and upper is not None + and is_scalar(lower) + and is_scalar(upper) + ): + lower, upper = min(lower, upper), max(lower, upper) + + # fast-path for scalars + if (lower is None or is_number(lower)) and (upper is None or is_number(upper)): + return self._clip_with_scalar(lower, upper, inplace=inplace) + + result = self + if lower is not None: + result = result._clip_with_one_bound( + lower, method=self.ge, axis=axis, inplace=inplace + ) + if upper is not None: + if inplace: + result = self + result = result._clip_with_one_bound( + upper, method=self.le, axis=axis, inplace=inplace + ) + + return result + + @final + @doc(klass=_shared_doc_kwargs["klass"]) + def asfreq( + self, + freq: Frequency, + method: FillnaOptions | None = None, + how: Literal["start", "end"] | None = None, + normalize: bool_t = False, + fill_value: Hashable | None = None, + ) -> Self: + """ + Convert time series to specified frequency. + + Returns the original data conformed to a new index with the specified + frequency. + + If the index of this {klass} is a :class:`~pandas.PeriodIndex`, the new index + is the result of transforming the original index with + :meth:`PeriodIndex.asfreq ` (so the original index + will map one-to-one to the new index). + + Otherwise, the new index will be equivalent to ``pd.date_range(start, end, + freq=freq)`` where ``start`` and ``end`` are, respectively, the first and + last entries in the original index (see :func:`pandas.date_range`). The + values corresponding to any timesteps in the new index which were not present + in the original index will be null (``NaN``), unless a method for filling + such unknowns is provided (see the ``method`` parameter below). + + The :meth:`resample` method is more appropriate if an operation on each group of + timesteps (such as an aggregate) is necessary to represent the data at the new + frequency. + + Parameters + ---------- + freq : DateOffset or str + Frequency DateOffset or string. + method : {{'backfill'/'bfill', 'pad'/'ffill'}}, default None + Method to use for filling holes in reindexed Series (note this + does not fill NaNs that already were present): + + * 'pad' / 'ffill': propagate last valid observation forward to next + valid + * 'backfill' / 'bfill': use NEXT valid observation to fill. + how : {{'start', 'end'}}, default end + For PeriodIndex only (see PeriodIndex.asfreq). + normalize : bool, default False + Whether to reset output index to midnight. + fill_value : scalar, optional + Value to use for missing values, applied during upsampling (note + this does not fill NaNs that already were present). + + Returns + ------- + {klass} + {klass} object reindexed to the specified frequency. + + See Also + -------- + reindex : Conform DataFrame to new index with optional filling logic. + + Notes + ----- + To learn more about the frequency strings, please see `this link + `__. + + Examples + -------- + Start by creating a series with 4 one minute timestamps. + + >>> index = pd.date_range('1/1/2000', periods=4, freq='min') + >>> series = pd.Series([0.0, None, 2.0, 3.0], index=index) + >>> df = pd.DataFrame({{'s': series}}) + >>> df + s + 2000-01-01 00:00:00 0.0 + 2000-01-01 00:01:00 NaN + 2000-01-01 00:02:00 2.0 + 2000-01-01 00:03:00 3.0 + + Upsample the series into 30 second bins. + + >>> df.asfreq(freq='30s') + s + 2000-01-01 00:00:00 0.0 + 2000-01-01 00:00:30 NaN + 2000-01-01 00:01:00 NaN + 2000-01-01 00:01:30 NaN + 2000-01-01 00:02:00 2.0 + 2000-01-01 00:02:30 NaN + 2000-01-01 00:03:00 3.0 + + Upsample again, providing a ``fill value``. + + >>> df.asfreq(freq='30s', fill_value=9.0) + s + 2000-01-01 00:00:00 0.0 + 2000-01-01 00:00:30 9.0 + 2000-01-01 00:01:00 NaN + 2000-01-01 00:01:30 9.0 + 2000-01-01 00:02:00 2.0 + 2000-01-01 00:02:30 9.0 + 2000-01-01 00:03:00 3.0 + + Upsample again, providing a ``method``. + + >>> df.asfreq(freq='30s', method='bfill') + s + 2000-01-01 00:00:00 0.0 + 2000-01-01 00:00:30 NaN + 2000-01-01 00:01:00 NaN + 2000-01-01 00:01:30 2.0 + 2000-01-01 00:02:00 2.0 + 2000-01-01 00:02:30 3.0 + 2000-01-01 00:03:00 3.0 + """ + from pandas.core.resample import asfreq + + return asfreq( + self, + freq, + method=method, + how=how, + normalize=normalize, + fill_value=fill_value, + ) + + @final + def at_time(self, time, asof: bool_t = False, axis: Axis | None = None) -> Self: + """ + Select values at particular time of day (e.g., 9:30AM). + + Parameters + ---------- + time : datetime.time or str + The values to select. + axis : {0 or 'index', 1 or 'columns'}, default 0 + For `Series` this parameter is unused and defaults to 0. + + Returns + ------- + Series or DataFrame + + Raises + ------ + TypeError + If the index is not a :class:`DatetimeIndex` + + See Also + -------- + between_time : Select values between particular times of the day. + first : Select initial periods of time series based on a date offset. + last : Select final periods of time series based on a date offset. + DatetimeIndex.indexer_at_time : Get just the index locations for + values at particular time of the day. + + Examples + -------- + >>> i = pd.date_range('2018-04-09', periods=4, freq='12h') + >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i) + >>> ts + A + 2018-04-09 00:00:00 1 + 2018-04-09 12:00:00 2 + 2018-04-10 00:00:00 3 + 2018-04-10 12:00:00 4 + + >>> ts.at_time('12:00') + A + 2018-04-09 12:00:00 2 + 2018-04-10 12:00:00 4 + """ + if axis is None: + axis = 0 + axis = self._get_axis_number(axis) + + index = self._get_axis(axis) + + if not isinstance(index, DatetimeIndex): + raise TypeError("Index must be DatetimeIndex") + + indexer = index.indexer_at_time(time, asof=asof) + return self._take_with_is_copy(indexer, axis=axis) + + @final + def between_time( + self, + start_time, + end_time, + inclusive: IntervalClosedType = "both", + axis: Axis | None = None, + ) -> Self: + """ + Select values between particular times of the day (e.g., 9:00-9:30 AM). + + By setting ``start_time`` to be later than ``end_time``, + you can get the times that are *not* between the two times. + + Parameters + ---------- + start_time : datetime.time or str + Initial time as a time filter limit. + end_time : datetime.time or str + End time as a time filter limit. + inclusive : {"both", "neither", "left", "right"}, default "both" + Include boundaries; whether to set each bound as closed or open. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Determine range time on index or columns value. + For `Series` this parameter is unused and defaults to 0. + + Returns + ------- + Series or DataFrame + Data from the original object filtered to the specified dates range. + + Raises + ------ + TypeError + If the index is not a :class:`DatetimeIndex` + + See Also + -------- + at_time : Select values at a particular time of the day. + first : Select initial periods of time series based on a date offset. + last : Select final periods of time series based on a date offset. + DatetimeIndex.indexer_between_time : Get just the index locations for + values between particular times of the day. + + Examples + -------- + >>> i = pd.date_range('2018-04-09', periods=4, freq='1D20min') + >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i) + >>> ts + A + 2018-04-09 00:00:00 1 + 2018-04-10 00:20:00 2 + 2018-04-11 00:40:00 3 + 2018-04-12 01:00:00 4 + + >>> ts.between_time('0:15', '0:45') + A + 2018-04-10 00:20:00 2 + 2018-04-11 00:40:00 3 + + You get the times that are *not* between two times by setting + ``start_time`` later than ``end_time``: + + >>> ts.between_time('0:45', '0:15') + A + 2018-04-09 00:00:00 1 + 2018-04-12 01:00:00 4 + """ + if axis is None: + axis = 0 + axis = self._get_axis_number(axis) + + index = self._get_axis(axis) + if not isinstance(index, DatetimeIndex): + raise TypeError("Index must be DatetimeIndex") + + left_inclusive, right_inclusive = validate_inclusive(inclusive) + indexer = index.indexer_between_time( + start_time, + end_time, + include_start=left_inclusive, + include_end=right_inclusive, + ) + return self._take_with_is_copy(indexer, axis=axis) + + @final + @doc(klass=_shared_doc_kwargs["klass"]) + def resample( + self, + rule, + axis: Axis | lib.NoDefault = lib.no_default, + closed: Literal["right", "left"] | None = None, + label: Literal["right", "left"] | None = None, + convention: Literal["start", "end", "s", "e"] | lib.NoDefault = lib.no_default, + kind: Literal["timestamp", "period"] | None | lib.NoDefault = lib.no_default, + on: Level | None = None, + level: Level | None = None, + origin: str | TimestampConvertibleTypes = "start_day", + offset: TimedeltaConvertibleTypes | None = None, + group_keys: bool_t = False, + ) -> Resampler: + """ + Resample time-series data. + + Convenience method for frequency conversion and resampling of time series. + The object must have a datetime-like index (`DatetimeIndex`, `PeriodIndex`, + or `TimedeltaIndex`), or the caller must pass the label of a datetime-like + series/index to the ``on``/``level`` keyword parameter. + + Parameters + ---------- + rule : DateOffset, Timedelta or str + The offset string or object representing target conversion. + axis : {{0 or 'index', 1 or 'columns'}}, default 0 + Which axis to use for up- or down-sampling. For `Series` this parameter + is unused and defaults to 0. Must be + `DatetimeIndex`, `TimedeltaIndex` or `PeriodIndex`. + + .. deprecated:: 2.0.0 + Use frame.T.resample(...) instead. + closed : {{'right', 'left'}}, default None + Which side of bin interval is closed. The default is 'left' + for all frequency offsets except for 'ME', 'YE', 'QE', 'BME', + 'BA', 'BQE', and 'W' which all have a default of 'right'. + label : {{'right', 'left'}}, default None + Which bin edge label to label bucket with. The default is 'left' + for all frequency offsets except for 'ME', 'YE', 'QE', 'BME', + 'BA', 'BQE', and 'W' which all have a default of 'right'. + convention : {{'start', 'end', 's', 'e'}}, default 'start' + For `PeriodIndex` only, controls whether to use the start or + end of `rule`. + + .. deprecated:: 2.2.0 + Convert PeriodIndex to DatetimeIndex before resampling instead. + kind : {{'timestamp', 'period'}}, optional, default None + Pass 'timestamp' to convert the resulting index to a + `DateTimeIndex` or 'period' to convert it to a `PeriodIndex`. + By default the input representation is retained. + + .. deprecated:: 2.2.0 + Convert index to desired type explicitly instead. + + on : str, optional + For a DataFrame, column to use instead of index for resampling. + Column must be datetime-like. + level : str or int, optional + For a MultiIndex, level (name or number) to use for + resampling. `level` must be datetime-like. + origin : Timestamp or str, default 'start_day' + The timestamp on which to adjust the grouping. The timezone of origin + must match the timezone of the index. + If string, must be one of the following: + + - 'epoch': `origin` is 1970-01-01 + - 'start': `origin` is the first value of the timeseries + - 'start_day': `origin` is the first day at midnight of the timeseries + + - 'end': `origin` is the last value of the timeseries + - 'end_day': `origin` is the ceiling midnight of the last day + + .. versionadded:: 1.3.0 + + .. note:: + + Only takes effect for Tick-frequencies (i.e. fixed frequencies like + days, hours, and minutes, rather than months or quarters). + offset : Timedelta or str, default is None + An offset timedelta added to the origin. + + group_keys : bool, default False + Whether to include the group keys in the result index when using + ``.apply()`` on the resampled object. + + .. versionadded:: 1.5.0 + + Not specifying ``group_keys`` will retain values-dependent behavior + from pandas 1.4 and earlier (see :ref:`pandas 1.5.0 Release notes + ` for examples). + + .. versionchanged:: 2.0.0 + + ``group_keys`` now defaults to ``False``. + + Returns + ------- + pandas.api.typing.Resampler + :class:`~pandas.core.Resampler` object. + + See Also + -------- + Series.resample : Resample a Series. + DataFrame.resample : Resample a DataFrame. + groupby : Group {klass} by mapping, function, label, or list of labels. + asfreq : Reindex a {klass} with the given frequency without grouping. + + Notes + ----- + See the `user guide + `__ + for more. + + To learn more about the offset strings, please see `this link + `__. + + Examples + -------- + Start by creating a series with 9 one minute timestamps. + + >>> index = pd.date_range('1/1/2000', periods=9, freq='min') + >>> series = pd.Series(range(9), index=index) + >>> series + 2000-01-01 00:00:00 0 + 2000-01-01 00:01:00 1 + 2000-01-01 00:02:00 2 + 2000-01-01 00:03:00 3 + 2000-01-01 00:04:00 4 + 2000-01-01 00:05:00 5 + 2000-01-01 00:06:00 6 + 2000-01-01 00:07:00 7 + 2000-01-01 00:08:00 8 + Freq: min, dtype: int64 + + Downsample the series into 3 minute bins and sum the values + of the timestamps falling into a bin. + + >>> series.resample('3min').sum() + 2000-01-01 00:00:00 3 + 2000-01-01 00:03:00 12 + 2000-01-01 00:06:00 21 + Freq: 3min, dtype: int64 + + Downsample the series into 3 minute bins as above, but label each + bin using the right edge instead of the left. Please note that the + value in the bucket used as the label is not included in the bucket, + which it labels. For example, in the original series the + bucket ``2000-01-01 00:03:00`` contains the value 3, but the summed + value in the resampled bucket with the label ``2000-01-01 00:03:00`` + does not include 3 (if it did, the summed value would be 6, not 3). + + >>> series.resample('3min', label='right').sum() + 2000-01-01 00:03:00 3 + 2000-01-01 00:06:00 12 + 2000-01-01 00:09:00 21 + Freq: 3min, dtype: int64 + + To include this value close the right side of the bin interval, + as shown below. + + >>> series.resample('3min', label='right', closed='right').sum() + 2000-01-01 00:00:00 0 + 2000-01-01 00:03:00 6 + 2000-01-01 00:06:00 15 + 2000-01-01 00:09:00 15 + Freq: 3min, dtype: int64 + + Upsample the series into 30 second bins. + + >>> series.resample('30s').asfreq()[0:5] # Select first 5 rows + 2000-01-01 00:00:00 0.0 + 2000-01-01 00:00:30 NaN + 2000-01-01 00:01:00 1.0 + 2000-01-01 00:01:30 NaN + 2000-01-01 00:02:00 2.0 + Freq: 30s, dtype: float64 + + Upsample the series into 30 second bins and fill the ``NaN`` + values using the ``ffill`` method. + + >>> series.resample('30s').ffill()[0:5] + 2000-01-01 00:00:00 0 + 2000-01-01 00:00:30 0 + 2000-01-01 00:01:00 1 + 2000-01-01 00:01:30 1 + 2000-01-01 00:02:00 2 + Freq: 30s, dtype: int64 + + Upsample the series into 30 second bins and fill the + ``NaN`` values using the ``bfill`` method. + + >>> series.resample('30s').bfill()[0:5] + 2000-01-01 00:00:00 0 + 2000-01-01 00:00:30 1 + 2000-01-01 00:01:00 1 + 2000-01-01 00:01:30 2 + 2000-01-01 00:02:00 2 + Freq: 30s, dtype: int64 + + Pass a custom function via ``apply`` + + >>> def custom_resampler(arraylike): + ... return np.sum(arraylike) + 5 + ... + >>> series.resample('3min').apply(custom_resampler) + 2000-01-01 00:00:00 8 + 2000-01-01 00:03:00 17 + 2000-01-01 00:06:00 26 + Freq: 3min, dtype: int64 + + For DataFrame objects, the keyword `on` can be used to specify the + column instead of the index for resampling. + + >>> d = {{'price': [10, 11, 9, 13, 14, 18, 17, 19], + ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}} + >>> df = pd.DataFrame(d) + >>> df['week_starting'] = pd.date_range('01/01/2018', + ... periods=8, + ... freq='W') + >>> df + price volume week_starting + 0 10 50 2018-01-07 + 1 11 60 2018-01-14 + 2 9 40 2018-01-21 + 3 13 100 2018-01-28 + 4 14 50 2018-02-04 + 5 18 100 2018-02-11 + 6 17 40 2018-02-18 + 7 19 50 2018-02-25 + >>> df.resample('ME', on='week_starting').mean() + price volume + week_starting + 2018-01-31 10.75 62.5 + 2018-02-28 17.00 60.0 + + For a DataFrame with MultiIndex, the keyword `level` can be used to + specify on which level the resampling needs to take place. + + >>> days = pd.date_range('1/1/2000', periods=4, freq='D') + >>> d2 = {{'price': [10, 11, 9, 13, 14, 18, 17, 19], + ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}} + >>> df2 = pd.DataFrame( + ... d2, + ... index=pd.MultiIndex.from_product( + ... [days, ['morning', 'afternoon']] + ... ) + ... ) + >>> df2 + price volume + 2000-01-01 morning 10 50 + afternoon 11 60 + 2000-01-02 morning 9 40 + afternoon 13 100 + 2000-01-03 morning 14 50 + afternoon 18 100 + 2000-01-04 morning 17 40 + afternoon 19 50 + >>> df2.resample('D', level=0).sum() + price volume + 2000-01-01 21 110 + 2000-01-02 22 140 + 2000-01-03 32 150 + 2000-01-04 36 90 + + If you want to adjust the start of the bins based on a fixed timestamp: + + >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00' + >>> rng = pd.date_range(start, end, freq='7min') + >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng) + >>> ts + 2000-10-01 23:30:00 0 + 2000-10-01 23:37:00 3 + 2000-10-01 23:44:00 6 + 2000-10-01 23:51:00 9 + 2000-10-01 23:58:00 12 + 2000-10-02 00:05:00 15 + 2000-10-02 00:12:00 18 + 2000-10-02 00:19:00 21 + 2000-10-02 00:26:00 24 + Freq: 7min, dtype: int64 + + >>> ts.resample('17min').sum() + 2000-10-01 23:14:00 0 + 2000-10-01 23:31:00 9 + 2000-10-01 23:48:00 21 + 2000-10-02 00:05:00 54 + 2000-10-02 00:22:00 24 + Freq: 17min, dtype: int64 + + >>> ts.resample('17min', origin='epoch').sum() + 2000-10-01 23:18:00 0 + 2000-10-01 23:35:00 18 + 2000-10-01 23:52:00 27 + 2000-10-02 00:09:00 39 + 2000-10-02 00:26:00 24 + Freq: 17min, dtype: int64 + + >>> ts.resample('17min', origin='2000-01-01').sum() + 2000-10-01 23:24:00 3 + 2000-10-01 23:41:00 15 + 2000-10-01 23:58:00 45 + 2000-10-02 00:15:00 45 + Freq: 17min, dtype: int64 + + If you want to adjust the start of the bins with an `offset` Timedelta, the two + following lines are equivalent: + + >>> ts.resample('17min', origin='start').sum() + 2000-10-01 23:30:00 9 + 2000-10-01 23:47:00 21 + 2000-10-02 00:04:00 54 + 2000-10-02 00:21:00 24 + Freq: 17min, dtype: int64 + + >>> ts.resample('17min', offset='23h30min').sum() + 2000-10-01 23:30:00 9 + 2000-10-01 23:47:00 21 + 2000-10-02 00:04:00 54 + 2000-10-02 00:21:00 24 + Freq: 17min, dtype: int64 + + If you want to take the largest Timestamp as the end of the bins: + + >>> ts.resample('17min', origin='end').sum() + 2000-10-01 23:35:00 0 + 2000-10-01 23:52:00 18 + 2000-10-02 00:09:00 27 + 2000-10-02 00:26:00 63 + Freq: 17min, dtype: int64 + + In contrast with the `start_day`, you can use `end_day` to take the ceiling + midnight of the largest Timestamp as the end of the bins and drop the bins + not containing data: + + >>> ts.resample('17min', origin='end_day').sum() + 2000-10-01 23:38:00 3 + 2000-10-01 23:55:00 15 + 2000-10-02 00:12:00 45 + 2000-10-02 00:29:00 45 + Freq: 17min, dtype: int64 + """ + from pandas.core.resample import get_resampler + + if axis is not lib.no_default: + axis = self._get_axis_number(axis) + if axis == 1: + warnings.warn( + "DataFrame.resample with axis=1 is deprecated. Do " + "`frame.T.resample(...)` without axis instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + warnings.warn( + f"The 'axis' keyword in {type(self).__name__}.resample is " + "deprecated and will be removed in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + axis = 0 + + if kind is not lib.no_default: + # GH#55895 + warnings.warn( + f"The 'kind' keyword in {type(self).__name__}.resample is " + "deprecated and will be removed in a future version. " + "Explicitly cast the index to the desired type instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + kind = None + + if convention is not lib.no_default: + warnings.warn( + f"The 'convention' keyword in {type(self).__name__}.resample is " + "deprecated and will be removed in a future version. " + "Explicitly cast PeriodIndex to DatetimeIndex before resampling " + "instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + convention = "start" + + return get_resampler( + cast("Series | DataFrame", self), + freq=rule, + label=label, + closed=closed, + axis=axis, + kind=kind, + convention=convention, + key=on, + level=level, + origin=origin, + offset=offset, + group_keys=group_keys, + ) + + @final + def first(self, offset) -> Self: + """ + Select initial periods of time series data based on a date offset. + + .. deprecated:: 2.1 + :meth:`.first` is deprecated and will be removed in a future version. + Please create a mask and filter using `.loc` instead. + + For a DataFrame with a sorted DatetimeIndex, this function can + select the first few rows based on a date offset. + + Parameters + ---------- + offset : str, DateOffset or dateutil.relativedelta + The offset length of the data that will be selected. For instance, + '1ME' will display all the rows having their index within the first month. + + Returns + ------- + Series or DataFrame + A subset of the caller. + + Raises + ------ + TypeError + If the index is not a :class:`DatetimeIndex` + + See Also + -------- + last : Select final periods of time series based on a date offset. + at_time : Select values at a particular time of the day. + between_time : Select values between particular times of the day. + + Examples + -------- + >>> i = pd.date_range('2018-04-09', periods=4, freq='2D') + >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i) + >>> ts + A + 2018-04-09 1 + 2018-04-11 2 + 2018-04-13 3 + 2018-04-15 4 + + Get the rows for the first 3 days: + + >>> ts.first('3D') + A + 2018-04-09 1 + 2018-04-11 2 + + Notice the data for 3 first calendar days were returned, not the first + 3 days observed in the dataset, and therefore data for 2018-04-13 was + not returned. + """ + warnings.warn( + "first is deprecated and will be removed in a future version. " + "Please create a mask and filter using `.loc` instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + if not isinstance(self.index, DatetimeIndex): + raise TypeError("'first' only supports a DatetimeIndex index") + + if len(self.index) == 0: + return self.copy(deep=False) + + offset = to_offset(offset) + if not isinstance(offset, Tick) and offset.is_on_offset(self.index[0]): + # GH#29623 if first value is end of period, remove offset with n = 1 + # before adding the real offset + end_date = end = self.index[0] - offset.base + offset + else: + end_date = end = self.index[0] + offset + + # Tick-like, e.g. 3 weeks + if isinstance(offset, Tick) and end_date in self.index: + end = self.index.searchsorted(end_date, side="left") + return self.iloc[:end] + + return self.loc[:end] + + @final + def last(self, offset) -> Self: + """ + Select final periods of time series data based on a date offset. + + .. deprecated:: 2.1 + :meth:`.last` is deprecated and will be removed in a future version. + Please create a mask and filter using `.loc` instead. + + For a DataFrame with a sorted DatetimeIndex, this function + selects the last few rows based on a date offset. + + Parameters + ---------- + offset : str, DateOffset, dateutil.relativedelta + The offset length of the data that will be selected. For instance, + '3D' will display all the rows having their index within the last 3 days. + + Returns + ------- + Series or DataFrame + A subset of the caller. + + Raises + ------ + TypeError + If the index is not a :class:`DatetimeIndex` + + See Also + -------- + first : Select initial periods of time series based on a date offset. + at_time : Select values at a particular time of the day. + between_time : Select values between particular times of the day. + + Notes + ----- + .. deprecated:: 2.1.0 + Please create a mask and filter using `.loc` instead + + Examples + -------- + >>> i = pd.date_range('2018-04-09', periods=4, freq='2D') + >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i) + >>> ts + A + 2018-04-09 1 + 2018-04-11 2 + 2018-04-13 3 + 2018-04-15 4 + + Get the rows for the last 3 days: + + >>> ts.last('3D') # doctest: +SKIP + A + 2018-04-13 3 + 2018-04-15 4 + + Notice the data for 3 last calendar days were returned, not the last + 3 observed days in the dataset, and therefore data for 2018-04-11 was + not returned. + """ + warnings.warn( + "last is deprecated and will be removed in a future version. " + "Please create a mask and filter using `.loc` instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if not isinstance(self.index, DatetimeIndex): + raise TypeError("'last' only supports a DatetimeIndex index") + + if len(self.index) == 0: + return self.copy(deep=False) + + offset = to_offset(offset) + + start_date = self.index[-1] - offset + start = self.index.searchsorted(start_date, side="right") + return self.iloc[start:] + + @final + def rank( + self, + axis: Axis = 0, + method: Literal["average", "min", "max", "first", "dense"] = "average", + numeric_only: bool_t = False, + na_option: Literal["keep", "top", "bottom"] = "keep", + ascending: bool_t = True, + pct: bool_t = False, + ) -> Self: + """ + Compute numerical data ranks (1 through n) along axis. + + By default, equal values are assigned a rank that is the average of the + ranks of those values. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + Index to direct ranking. + For `Series` this parameter is unused and defaults to 0. + method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' + How to rank the group of records that have the same value (i.e. ties): + + * average: average rank of the group + * min: lowest rank in the group + * max: highest rank in the group + * first: ranks assigned in order they appear in the array + * dense: like 'min', but rank always increases by 1 between groups. + + numeric_only : bool, default False + For DataFrame objects, rank only numeric columns if set to True. + + .. versionchanged:: 2.0.0 + The default value of ``numeric_only`` is now ``False``. + + na_option : {'keep', 'top', 'bottom'}, default 'keep' + How to rank NaN values: + + * keep: assign NaN rank to NaN values + * top: assign lowest rank to NaN values + * bottom: assign highest rank to NaN values + + ascending : bool, default True + Whether or not the elements should be ranked in ascending order. + pct : bool, default False + Whether or not to display the returned rankings in percentile + form. + + Returns + ------- + same type as caller + Return a Series or DataFrame with data ranks as values. + + See Also + -------- + core.groupby.DataFrameGroupBy.rank : Rank of values within each group. + core.groupby.SeriesGroupBy.rank : Rank of values within each group. + + Examples + -------- + >>> df = pd.DataFrame(data={'Animal': ['cat', 'penguin', 'dog', + ... 'spider', 'snake'], + ... 'Number_legs': [4, 2, 4, 8, np.nan]}) + >>> df + Animal Number_legs + 0 cat 4.0 + 1 penguin 2.0 + 2 dog 4.0 + 3 spider 8.0 + 4 snake NaN + + Ties are assigned the mean of the ranks (by default) for the group. + + >>> s = pd.Series(range(5), index=list("abcde")) + >>> s["d"] = s["b"] + >>> s.rank() + a 1.0 + b 2.5 + c 4.0 + d 2.5 + e 5.0 + dtype: float64 + + The following example shows how the method behaves with the above + parameters: + + * default_rank: this is the default behaviour obtained without using + any parameter. + * max_rank: setting ``method = 'max'`` the records that have the + same values are ranked using the highest rank (e.g.: since 'cat' + and 'dog' are both in the 2nd and 3rd position, rank 3 is assigned.) + * NA_bottom: choosing ``na_option = 'bottom'``, if there are records + with NaN values they are placed at the bottom of the ranking. + * pct_rank: when setting ``pct = True``, the ranking is expressed as + percentile rank. + + >>> df['default_rank'] = df['Number_legs'].rank() + >>> df['max_rank'] = df['Number_legs'].rank(method='max') + >>> df['NA_bottom'] = df['Number_legs'].rank(na_option='bottom') + >>> df['pct_rank'] = df['Number_legs'].rank(pct=True) + >>> df + Animal Number_legs default_rank max_rank NA_bottom pct_rank + 0 cat 4.0 2.5 3.0 2.5 0.625 + 1 penguin 2.0 1.0 1.0 1.0 0.250 + 2 dog 4.0 2.5 3.0 2.5 0.625 + 3 spider 8.0 4.0 4.0 4.0 1.000 + 4 snake NaN NaN NaN 5.0 NaN + """ + axis_int = self._get_axis_number(axis) + + if na_option not in {"keep", "top", "bottom"}: + msg = "na_option must be one of 'keep', 'top', or 'bottom'" + raise ValueError(msg) + + def ranker(data): + if data.ndim == 2: + # i.e. DataFrame, we cast to ndarray + values = data.values + else: + # i.e. Series, can dispatch to EA + values = data._values + + if isinstance(values, ExtensionArray): + ranks = values._rank( + axis=axis_int, + method=method, + ascending=ascending, + na_option=na_option, + pct=pct, + ) + else: + ranks = algos.rank( + values, + axis=axis_int, + method=method, + ascending=ascending, + na_option=na_option, + pct=pct, + ) + + ranks_obj = self._constructor(ranks, **data._construct_axes_dict()) + return ranks_obj.__finalize__(self, method="rank") + + if numeric_only: + if self.ndim == 1 and not is_numeric_dtype(self.dtype): + # GH#47500 + raise TypeError( + "Series.rank does not allow numeric_only=True with " + "non-numeric dtype." + ) + data = self._get_numeric_data() + else: + data = self + + return ranker(data) + + @doc(_shared_docs["compare"], klass=_shared_doc_kwargs["klass"]) + def compare( + self, + other, + align_axis: Axis = 1, + keep_shape: bool_t = False, + keep_equal: bool_t = False, + result_names: Suffixes = ("self", "other"), + ): + if type(self) is not type(other): + cls_self, cls_other = type(self).__name__, type(other).__name__ + raise TypeError( + f"can only compare '{cls_self}' (not '{cls_other}') with '{cls_self}'" + ) + + mask = ~((self == other) | (self.isna() & other.isna())) + mask.fillna(True, inplace=True) + + if not keep_equal: + self = self.where(mask) + other = other.where(mask) + + if not keep_shape: + if isinstance(self, ABCDataFrame): + cmask = mask.any() + rmask = mask.any(axis=1) + self = self.loc[rmask, cmask] + other = other.loc[rmask, cmask] + else: + self = self[mask] + other = other[mask] + if not isinstance(result_names, tuple): + raise TypeError( + f"Passing 'result_names' as a {type(result_names)} is not " + "supported. Provide 'result_names' as a tuple instead." + ) + + if align_axis in (1, "columns"): # This is needed for Series + axis = 1 + else: + axis = self._get_axis_number(align_axis) + + # error: List item 0 has incompatible type "NDFrame"; expected + # "Union[Series, DataFrame]" + diff = concat( + [self, other], # type: ignore[list-item] + axis=axis, + keys=result_names, + ) + + if axis >= self.ndim: + # No need to reorganize data if stacking on new axis + # This currently applies for stacking two Series on columns + return diff + + ax = diff._get_axis(axis) + ax_names = np.array(ax.names) + + # set index names to positions to avoid confusion + ax.names = np.arange(len(ax_names)) + + # bring self-other to inner level + order = list(range(1, ax.nlevels)) + [0] + if isinstance(diff, ABCDataFrame): + diff = diff.reorder_levels(order, axis=axis) + else: + diff = diff.reorder_levels(order) + + # restore the index names in order + diff._get_axis(axis=axis).names = ax_names[order] + + # reorder axis to keep things organized + indices = ( + np.arange(diff.shape[axis]).reshape([2, diff.shape[axis] // 2]).T.flatten() + ) + diff = diff.take(indices, axis=axis) + + return diff + + @final + @doc( + klass=_shared_doc_kwargs["klass"], + axes_single_arg=_shared_doc_kwargs["axes_single_arg"], + ) + def align( + self, + other: NDFrameT, + join: AlignJoin = "outer", + axis: Axis | None = None, + level: Level | None = None, + copy: bool_t | None = None, + fill_value: Hashable | None = None, + method: FillnaOptions | None | lib.NoDefault = lib.no_default, + limit: int | None | lib.NoDefault = lib.no_default, + fill_axis: Axis | lib.NoDefault = lib.no_default, + broadcast_axis: Axis | None | lib.NoDefault = lib.no_default, + ) -> tuple[Self, NDFrameT]: + """ + Align two objects on their axes with the specified join method. + + Join method is specified for each axis Index. + + Parameters + ---------- + other : DataFrame or Series + join : {{'outer', 'inner', 'left', 'right'}}, default 'outer' + Type of alignment to be performed. + + * left: use only keys from left frame, preserve key order. + * right: use only keys from right frame, preserve key order. + * outer: use union of keys from both frames, sort keys lexicographically. + * inner: use intersection of keys from both frames, + preserve the order of the left keys. + + axis : allowed axis of the other object, default None + Align on index (0), columns (1), or both (None). + level : int or level name, default None + Broadcast across a level, matching Index values on the + passed MultiIndex level. + copy : bool, default True + Always returns new objects. If copy=False and no reindexing is + required then original objects are returned. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + fill_value : scalar, default np.nan + Value to use for missing values. Defaults to NaN, but can be any + "compatible" value. + method : {{'backfill', 'bfill', 'pad', 'ffill', None}}, default None + Method to use for filling holes in reindexed Series: + + - pad / ffill: propagate last valid observation forward to next valid. + - backfill / bfill: use NEXT valid observation to fill gap. + + .. deprecated:: 2.1 + + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. Must be greater than 0 if not None. + + .. deprecated:: 2.1 + + fill_axis : {axes_single_arg}, default 0 + Filling axis, method and limit. + + .. deprecated:: 2.1 + + broadcast_axis : {axes_single_arg}, default None + Broadcast values along this axis, if aligning two objects of + different dimensions. + + .. deprecated:: 2.1 + + Returns + ------- + tuple of ({klass}, type of other) + Aligned objects. + + Examples + -------- + >>> df = pd.DataFrame( + ... [[1, 2, 3, 4], [6, 7, 8, 9]], columns=["D", "B", "E", "A"], index=[1, 2] + ... ) + >>> other = pd.DataFrame( + ... [[10, 20, 30, 40], [60, 70, 80, 90], [600, 700, 800, 900]], + ... columns=["A", "B", "C", "D"], + ... index=[2, 3, 4], + ... ) + >>> df + D B E A + 1 1 2 3 4 + 2 6 7 8 9 + >>> other + A B C D + 2 10 20 30 40 + 3 60 70 80 90 + 4 600 700 800 900 + + Align on columns: + + >>> left, right = df.align(other, join="outer", axis=1) + >>> left + A B C D E + 1 4 2 NaN 1 3 + 2 9 7 NaN 6 8 + >>> right + A B C D E + 2 10 20 30 40 NaN + 3 60 70 80 90 NaN + 4 600 700 800 900 NaN + + We can also align on the index: + + >>> left, right = df.align(other, join="outer", axis=0) + >>> left + D B E A + 1 1.0 2.0 3.0 4.0 + 2 6.0 7.0 8.0 9.0 + 3 NaN NaN NaN NaN + 4 NaN NaN NaN NaN + >>> right + A B C D + 1 NaN NaN NaN NaN + 2 10.0 20.0 30.0 40.0 + 3 60.0 70.0 80.0 90.0 + 4 600.0 700.0 800.0 900.0 + + Finally, the default `axis=None` will align on both index and columns: + + >>> left, right = df.align(other, join="outer", axis=None) + >>> left + A B C D E + 1 4.0 2.0 NaN 1.0 3.0 + 2 9.0 7.0 NaN 6.0 8.0 + 3 NaN NaN NaN NaN NaN + 4 NaN NaN NaN NaN NaN + >>> right + A B C D E + 1 NaN NaN NaN NaN NaN + 2 10.0 20.0 30.0 40.0 NaN + 3 60.0 70.0 80.0 90.0 NaN + 4 600.0 700.0 800.0 900.0 NaN + """ + if ( + method is not lib.no_default + or limit is not lib.no_default + or fill_axis is not lib.no_default + ): + # GH#51856 + warnings.warn( + "The 'method', 'limit', and 'fill_axis' keywords in " + f"{type(self).__name__}.align are deprecated and will be removed " + "in a future version. Call fillna directly on the returned objects " + "instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if fill_axis is lib.no_default: + fill_axis = 0 + if method is lib.no_default: + method = None + if limit is lib.no_default: + limit = None + + if method is not None: + method = clean_fill_method(method) + + if broadcast_axis is not lib.no_default: + # GH#51856 + # TODO(3.0): enforcing this deprecation will close GH#13194 + msg = ( + f"The 'broadcast_axis' keyword in {type(self).__name__}.align is " + "deprecated and will be removed in a future version." + ) + if broadcast_axis is not None: + if self.ndim == 1 and other.ndim == 2: + msg += ( + " Use left = DataFrame({col: left for col in right.columns}, " + "index=right.index) before calling `left.align(right)` instead." + ) + elif self.ndim == 2 and other.ndim == 1: + msg += ( + " Use right = DataFrame({col: right for col in left.columns}, " + "index=left.index) before calling `left.align(right)` instead" + ) + warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) + else: + broadcast_axis = None + + if broadcast_axis == 1 and self.ndim != other.ndim: + if isinstance(self, ABCSeries): + # this means other is a DataFrame, and we need to broadcast + # self + cons = self._constructor_expanddim + df = cons( + {c: self for c in other.columns}, **other._construct_axes_dict() + ) + # error: Incompatible return value type (got "Tuple[DataFrame, + # DataFrame]", expected "Tuple[Self, NDFrameT]") + return df._align_frame( # type: ignore[return-value] + other, # type: ignore[arg-type] + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + )[:2] + elif isinstance(other, ABCSeries): + # this means self is a DataFrame, and we need to broadcast + # other + cons = other._constructor_expanddim + df = cons( + {c: other for c in self.columns}, **self._construct_axes_dict() + ) + # error: Incompatible return value type (got "Tuple[NDFrameT, + # DataFrame]", expected "Tuple[Self, NDFrameT]") + return self._align_frame( # type: ignore[return-value] + df, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + )[:2] + + _right: DataFrame | Series + if axis is not None: + axis = self._get_axis_number(axis) + if isinstance(other, ABCDataFrame): + left, _right, join_index = self._align_frame( + other, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + ) + + elif isinstance(other, ABCSeries): + left, _right, join_index = self._align_series( + other, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + ) + else: # pragma: no cover + raise TypeError(f"unsupported type: {type(other)}") + + right = cast(NDFrameT, _right) + if self.ndim == 1 or axis == 0: + # If we are aligning timezone-aware DatetimeIndexes and the timezones + # do not match, convert both to UTC. + if isinstance(left.index.dtype, DatetimeTZDtype): + if left.index.tz != right.index.tz: + if join_index is not None: + # GH#33671 copy to ensure we don't change the index on + # our original Series + left = left.copy(deep=False) + right = right.copy(deep=False) + left.index = join_index + right.index = join_index + + left = left.__finalize__(self) + right = right.__finalize__(other) + return left, right + + @final + def _align_frame( + self, + other: DataFrame, + join: AlignJoin = "outer", + axis: Axis | None = None, + level=None, + copy: bool_t | None = None, + fill_value=None, + method=None, + limit: int | None = None, + fill_axis: Axis = 0, + ) -> tuple[Self, DataFrame, Index | None]: + # defaults + join_index, join_columns = None, None + ilidx, iridx = None, None + clidx, cridx = None, None + + is_series = isinstance(self, ABCSeries) + + if (axis is None or axis == 0) and not self.index.equals(other.index): + join_index, ilidx, iridx = self.index.join( + other.index, how=join, level=level, return_indexers=True + ) + + if ( + (axis is None or axis == 1) + and not is_series + and not self.columns.equals(other.columns) + ): + join_columns, clidx, cridx = self.columns.join( + other.columns, how=join, level=level, return_indexers=True + ) + + if is_series: + reindexers = {0: [join_index, ilidx]} + else: + reindexers = {0: [join_index, ilidx], 1: [join_columns, clidx]} + + left = self._reindex_with_indexers( + reindexers, copy=copy, fill_value=fill_value, allow_dups=True + ) + # other must be always DataFrame + right = other._reindex_with_indexers( + {0: [join_index, iridx], 1: [join_columns, cridx]}, + copy=copy, + fill_value=fill_value, + allow_dups=True, + ) + + if method is not None: + left = left._pad_or_backfill(method, axis=fill_axis, limit=limit) + right = right._pad_or_backfill(method, axis=fill_axis, limit=limit) + + return left, right, join_index + + @final + def _align_series( + self, + other: Series, + join: AlignJoin = "outer", + axis: Axis | None = None, + level=None, + copy: bool_t | None = None, + fill_value=None, + method=None, + limit: int | None = None, + fill_axis: Axis = 0, + ) -> tuple[Self, Series, Index | None]: + is_series = isinstance(self, ABCSeries) + if copy and using_copy_on_write(): + copy = False + + if (not is_series and axis is None) or axis not in [None, 0, 1]: + raise ValueError("Must specify axis=0 or 1") + + if is_series and axis == 1: + raise ValueError("cannot align series to a series other than axis 0") + + # series/series compat, other must always be a Series + if not axis: + # equal + if self.index.equals(other.index): + join_index, lidx, ridx = None, None, None + else: + join_index, lidx, ridx = self.index.join( + other.index, how=join, level=level, return_indexers=True + ) + + if is_series: + left = self._reindex_indexer(join_index, lidx, copy) + elif lidx is None or join_index is None: + left = self.copy(deep=copy) + else: + new_mgr = self._mgr.reindex_indexer(join_index, lidx, axis=1, copy=copy) + left = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + + right = other._reindex_indexer(join_index, ridx, copy) + + else: + # one has > 1 ndim + fdata = self._mgr + join_index = self.axes[1] + lidx, ridx = None, None + if not join_index.equals(other.index): + join_index, lidx, ridx = join_index.join( + other.index, how=join, level=level, return_indexers=True + ) + + if lidx is not None: + bm_axis = self._get_block_manager_axis(1) + fdata = fdata.reindex_indexer(join_index, lidx, axis=bm_axis) + + if copy and fdata is self._mgr: + fdata = fdata.copy() + + left = self._constructor_from_mgr(fdata, axes=fdata.axes) + + if ridx is None: + right = other.copy(deep=copy) + else: + right = other.reindex(join_index, level=level) + + # fill + fill_na = notna(fill_value) or (method is not None) + if fill_na: + fill_value, method = validate_fillna_kwargs(fill_value, method) + if method is not None: + left = left._pad_or_backfill(method, limit=limit, axis=fill_axis) + right = right._pad_or_backfill(method, limit=limit) + else: + left = left.fillna(fill_value, limit=limit, axis=fill_axis) + right = right.fillna(fill_value, limit=limit) + + return left, right, join_index + + @final + def _where( + self, + cond, + other=lib.no_default, + inplace: bool_t = False, + axis: Axis | None = None, + level=None, + warn: bool_t = True, + ): + """ + Equivalent to public method `where`, except that `other` is not + applied as a function even if callable. Used in __setitem__. + """ + inplace = validate_bool_kwarg(inplace, "inplace") + + if axis is not None: + axis = self._get_axis_number(axis) + + # align the cond to same shape as myself + cond = common.apply_if_callable(cond, self) + if isinstance(cond, NDFrame): + # CoW: Make sure reference is not kept alive + if cond.ndim == 1 and self.ndim == 2: + cond = cond._constructor_expanddim( + {i: cond for i in range(len(self.columns))}, + copy=False, + ) + cond.columns = self.columns + cond = cond.align(self, join="right", copy=False)[0] + else: + if not hasattr(cond, "shape"): + cond = np.asanyarray(cond) + if cond.shape != self.shape: + raise ValueError("Array conditional must be same shape as self") + cond = self._constructor(cond, **self._construct_axes_dict(), copy=False) + + # make sure we are boolean + fill_value = bool(inplace) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Downcasting object dtype arrays", + category=FutureWarning, + ) + cond = cond.fillna(fill_value) + cond = cond.infer_objects(copy=False) + + msg = "Boolean array expected for the condition, not {dtype}" + + if not cond.empty: + if not isinstance(cond, ABCDataFrame): + # This is a single-dimensional object. + if not is_bool_dtype(cond): + raise ValueError(msg.format(dtype=cond.dtype)) + else: + for _dt in cond.dtypes: + if not is_bool_dtype(_dt): + raise ValueError(msg.format(dtype=_dt)) + if cond._mgr.any_extension_types: + # GH51574: avoid object ndarray conversion later on + cond = cond._constructor( + cond.to_numpy(dtype=bool, na_value=fill_value), + **cond._construct_axes_dict(), + ) + else: + # GH#21947 we have an empty DataFrame/Series, could be object-dtype + cond = cond.astype(bool) + + cond = -cond if inplace else cond + cond = cond.reindex(self._info_axis, axis=self._info_axis_number, copy=False) + + # try to align with other + if isinstance(other, NDFrame): + # align with me + if other.ndim <= self.ndim: + # CoW: Make sure reference is not kept alive + other = self.align( + other, + join="left", + axis=axis, + level=level, + fill_value=None, + copy=False, + )[1] + + # if we are NOT aligned, raise as we cannot where index + if axis is None and not other._indexed_same(self): + raise InvalidIndexError + + if other.ndim < self.ndim: + # TODO(EA2D): avoid object-dtype cast in EA case GH#38729 + other = other._values + if axis == 0: + other = np.reshape(other, (-1, 1)) + elif axis == 1: + other = np.reshape(other, (1, -1)) + + other = np.broadcast_to(other, self.shape) + + # slice me out of the other + else: + raise NotImplementedError( + "cannot align with a higher dimensional NDFrame" + ) + + elif not isinstance(other, (MultiIndex, NDFrame)): + # mainly just catching Index here + other = extract_array(other, extract_numpy=True) + + if isinstance(other, (np.ndarray, ExtensionArray)): + if other.shape != self.shape: + if self.ndim != 1: + # In the ndim == 1 case we may have + # other length 1, which we treat as scalar (GH#2745, GH#4192) + # or len(other) == icond.sum(), which we treat like + # __setitem__ (GH#3235) + raise ValueError( + "other must be the same shape as self when an ndarray" + ) + + # we are the same shape, so create an actual object for alignment + else: + other = self._constructor( + other, **self._construct_axes_dict(), copy=False + ) + + if axis is None: + axis = 0 + + if self.ndim == getattr(other, "ndim", 0): + align = True + else: + align = self._get_axis_number(axis) == 1 + + if inplace: + # we may have different type blocks come out of putmask, so + # reconstruct the block manager + + new_data = self._mgr.putmask(mask=cond, new=other, align=align, warn=warn) + result = self._constructor_from_mgr(new_data, axes=new_data.axes) + return self._update_inplace(result) + + else: + new_data = self._mgr.where( + other=other, + cond=cond, + align=align, + ) + result = self._constructor_from_mgr(new_data, axes=new_data.axes) + return result.__finalize__(self) + + @overload + def where( + self, + cond, + other=..., + *, + inplace: Literal[False] = ..., + axis: Axis | None = ..., + level: Level = ..., + ) -> Self: + ... + + @overload + def where( + self, + cond, + other=..., + *, + inplace: Literal[True], + axis: Axis | None = ..., + level: Level = ..., + ) -> None: + ... + + @overload + def where( + self, + cond, + other=..., + *, + inplace: bool_t = ..., + axis: Axis | None = ..., + level: Level = ..., + ) -> Self | None: + ... + + @final + @doc( + klass=_shared_doc_kwargs["klass"], + cond="True", + cond_rev="False", + name="where", + name_other="mask", + ) + def where( + self, + cond, + other=np.nan, + *, + inplace: bool_t = False, + axis: Axis | None = None, + level: Level | None = None, + ) -> Self | None: + """ + Replace values where the condition is {cond_rev}. + + Parameters + ---------- + cond : bool {klass}, array-like, or callable + Where `cond` is {cond}, keep the original value. Where + {cond_rev}, replace with corresponding value from `other`. + If `cond` is callable, it is computed on the {klass} and + should return boolean {klass} or array. The callable must + not change input {klass} (though pandas doesn't check it). + other : scalar, {klass}, or callable + Entries where `cond` is {cond_rev} are replaced with + corresponding value from `other`. + If other is callable, it is computed on the {klass} and + should return scalar or {klass}. The callable must not + change input {klass} (though pandas doesn't check it). + If not specified, entries will be filled with the corresponding + NULL value (``np.nan`` for numpy dtypes, ``pd.NA`` for extension + dtypes). + inplace : bool, default False + Whether to perform the operation in place on the data. + axis : int, default None + Alignment axis if needed. For `Series` this parameter is + unused and defaults to 0. + level : int, default None + Alignment level if needed. + + Returns + ------- + Same type as caller or None if ``inplace=True``. + + See Also + -------- + :func:`DataFrame.{name_other}` : Return an object of same shape as + self. + + Notes + ----- + The {name} method is an application of the if-then idiom. For each + element in the calling DataFrame, if ``cond`` is ``{cond}`` the + element is used; otherwise the corresponding element from the DataFrame + ``other`` is used. If the axis of ``other`` does not align with axis of + ``cond`` {klass}, the misaligned index positions will be filled with + {cond_rev}. + + The signature for :func:`DataFrame.where` differs from + :func:`numpy.where`. Roughly ``df1.where(m, df2)`` is equivalent to + ``np.where(m, df1, df2)``. + + For further details and examples see the ``{name}`` documentation in + :ref:`indexing `. + + The dtype of the object takes precedence. The fill value is casted to + the object's dtype, if this can be done losslessly. + + Examples + -------- + >>> s = pd.Series(range(5)) + >>> s.where(s > 0) + 0 NaN + 1 1.0 + 2 2.0 + 3 3.0 + 4 4.0 + dtype: float64 + >>> s.mask(s > 0) + 0 0.0 + 1 NaN + 2 NaN + 3 NaN + 4 NaN + dtype: float64 + + >>> s = pd.Series(range(5)) + >>> t = pd.Series([True, False]) + >>> s.where(t, 99) + 0 0 + 1 99 + 2 99 + 3 99 + 4 99 + dtype: int64 + >>> s.mask(t, 99) + 0 99 + 1 1 + 2 99 + 3 99 + 4 99 + dtype: int64 + + >>> s.where(s > 1, 10) + 0 10 + 1 10 + 2 2 + 3 3 + 4 4 + dtype: int64 + >>> s.mask(s > 1, 10) + 0 0 + 1 1 + 2 10 + 3 10 + 4 10 + dtype: int64 + + >>> df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B']) + >>> df + A B + 0 0 1 + 1 2 3 + 2 4 5 + 3 6 7 + 4 8 9 + >>> m = df % 3 == 0 + >>> df.where(m, -df) + A B + 0 0 -1 + 1 -2 3 + 2 -4 -5 + 3 6 -7 + 4 -8 9 + >>> df.where(m, -df) == np.where(m, df, -df) + A B + 0 True True + 1 True True + 2 True True + 3 True True + 4 True True + >>> df.where(m, -df) == df.mask(~m, -df) + A B + 0 True True + 1 True True + 2 True True + 3 True True + 4 True True + """ + inplace = validate_bool_kwarg(inplace, "inplace") + if inplace: + if not PYPY and using_copy_on_write(): + if sys.getrefcount(self) <= REF_COUNT: + warnings.warn( + _chained_assignment_method_msg, + ChainedAssignmentError, + stacklevel=2, + ) + elif ( + not PYPY + and not using_copy_on_write() + and self._is_view_after_cow_rules() + ): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and hasattr(self, "_cacher"): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) + + other = common.apply_if_callable(other, self) + return self._where(cond, other, inplace, axis, level) + + @overload + def mask( + self, + cond, + other=..., + *, + inplace: Literal[False] = ..., + axis: Axis | None = ..., + level: Level = ..., + ) -> Self: + ... + + @overload + def mask( + self, + cond, + other=..., + *, + inplace: Literal[True], + axis: Axis | None = ..., + level: Level = ..., + ) -> None: + ... + + @overload + def mask( + self, + cond, + other=..., + *, + inplace: bool_t = ..., + axis: Axis | None = ..., + level: Level = ..., + ) -> Self | None: + ... + + @final + @doc( + where, + klass=_shared_doc_kwargs["klass"], + cond="False", + cond_rev="True", + name="mask", + name_other="where", + ) + def mask( + self, + cond, + other=lib.no_default, + *, + inplace: bool_t = False, + axis: Axis | None = None, + level: Level | None = None, + ) -> Self | None: + inplace = validate_bool_kwarg(inplace, "inplace") + if inplace: + if not PYPY and using_copy_on_write(): + if sys.getrefcount(self) <= REF_COUNT: + warnings.warn( + _chained_assignment_method_msg, + ChainedAssignmentError, + stacklevel=2, + ) + elif ( + not PYPY + and not using_copy_on_write() + and self._is_view_after_cow_rules() + ): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and hasattr(self, "_cacher"): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) + + cond = common.apply_if_callable(cond, self) + other = common.apply_if_callable(other, self) + + # see gh-21891 + if not hasattr(cond, "__invert__"): + cond = np.array(cond) + + return self._where( + ~cond, + other=other, + inplace=inplace, + axis=axis, + level=level, + ) + + @doc(klass=_shared_doc_kwargs["klass"]) + def shift( + self, + periods: int | Sequence[int] = 1, + freq=None, + axis: Axis = 0, + fill_value: Hashable = lib.no_default, + suffix: str | None = None, + ) -> Self | DataFrame: + """ + Shift index by desired number of periods with an optional time `freq`. + + When `freq` is not passed, shift the index without realigning the data. + If `freq` is passed (in this case, the index must be date or datetime, + or it will raise a `NotImplementedError`), the index will be + increased using the periods and the `freq`. `freq` can be inferred + when specified as "infer" as long as either freq or inferred_freq + attribute is set in the index. + + Parameters + ---------- + periods : int or Sequence + Number of periods to shift. Can be positive or negative. + If an iterable of ints, the data will be shifted once by each int. + This is equivalent to shifting by one value at a time and + concatenating all resulting frames. The resulting columns will have + the shift suffixed to their column names. For multiple periods, + axis must not be 1. + freq : DateOffset, tseries.offsets, timedelta, or str, optional + Offset to use from the tseries module or time rule (e.g. 'EOM'). + If `freq` is specified then the index values are shifted but the + data is not realigned. That is, use `freq` if you would like to + extend the index when shifting and preserve the original data. + If `freq` is specified as "infer" then it will be inferred from + the freq or inferred_freq attributes of the index. If neither of + those attributes exist, a ValueError is thrown. + axis : {{0 or 'index', 1 or 'columns', None}}, default None + Shift direction. For `Series` this parameter is unused and defaults to 0. + fill_value : object, optional + The scalar value to use for newly introduced missing values. + the default depends on the dtype of `self`. + For numeric data, ``np.nan`` is used. + For datetime, timedelta, or period data, etc. :attr:`NaT` is used. + For extension dtypes, ``self.dtype.na_value`` is used. + suffix : str, optional + If str and periods is an iterable, this is added after the column + name and before the shift value for each shifted column name. + + Returns + ------- + {klass} + Copy of input object, shifted. + + See Also + -------- + Index.shift : Shift values of Index. + DatetimeIndex.shift : Shift values of DatetimeIndex. + PeriodIndex.shift : Shift values of PeriodIndex. + + Examples + -------- + >>> df = pd.DataFrame({{"Col1": [10, 20, 15, 30, 45], + ... "Col2": [13, 23, 18, 33, 48], + ... "Col3": [17, 27, 22, 37, 52]}}, + ... index=pd.date_range("2020-01-01", "2020-01-05")) + >>> df + Col1 Col2 Col3 + 2020-01-01 10 13 17 + 2020-01-02 20 23 27 + 2020-01-03 15 18 22 + 2020-01-04 30 33 37 + 2020-01-05 45 48 52 + + >>> df.shift(periods=3) + Col1 Col2 Col3 + 2020-01-01 NaN NaN NaN + 2020-01-02 NaN NaN NaN + 2020-01-03 NaN NaN NaN + 2020-01-04 10.0 13.0 17.0 + 2020-01-05 20.0 23.0 27.0 + + >>> df.shift(periods=1, axis="columns") + Col1 Col2 Col3 + 2020-01-01 NaN 10 13 + 2020-01-02 NaN 20 23 + 2020-01-03 NaN 15 18 + 2020-01-04 NaN 30 33 + 2020-01-05 NaN 45 48 + + >>> df.shift(periods=3, fill_value=0) + Col1 Col2 Col3 + 2020-01-01 0 0 0 + 2020-01-02 0 0 0 + 2020-01-03 0 0 0 + 2020-01-04 10 13 17 + 2020-01-05 20 23 27 + + >>> df.shift(periods=3, freq="D") + Col1 Col2 Col3 + 2020-01-04 10 13 17 + 2020-01-05 20 23 27 + 2020-01-06 15 18 22 + 2020-01-07 30 33 37 + 2020-01-08 45 48 52 + + >>> df.shift(periods=3, freq="infer") + Col1 Col2 Col3 + 2020-01-04 10 13 17 + 2020-01-05 20 23 27 + 2020-01-06 15 18 22 + 2020-01-07 30 33 37 + 2020-01-08 45 48 52 + + >>> df['Col1'].shift(periods=[0, 1, 2]) + Col1_0 Col1_1 Col1_2 + 2020-01-01 10 NaN NaN + 2020-01-02 20 10.0 NaN + 2020-01-03 15 20.0 10.0 + 2020-01-04 30 15.0 20.0 + 2020-01-05 45 30.0 15.0 + """ + axis = self._get_axis_number(axis) + + if freq is not None and fill_value is not lib.no_default: + # GH#53832 + warnings.warn( + "Passing a 'freq' together with a 'fill_value' silently ignores " + "the fill_value and is deprecated. This will raise in a future " + "version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + fill_value = lib.no_default + + if periods == 0: + return self.copy(deep=None) + + if is_list_like(periods) and isinstance(self, ABCSeries): + return self.to_frame().shift( + periods=periods, freq=freq, axis=axis, fill_value=fill_value + ) + periods = cast(int, periods) + + if freq is None: + # when freq is None, data is shifted, index is not + axis = self._get_axis_number(axis) + assert axis == 0 # axis == 1 cases handled in DataFrame.shift + new_data = self._mgr.shift(periods=periods, fill_value=fill_value) + return self._constructor_from_mgr( + new_data, axes=new_data.axes + ).__finalize__(self, method="shift") + + return self._shift_with_freq(periods, axis, freq) + + @final + def _shift_with_freq(self, periods: int, axis: int, freq) -> Self: + # see shift.__doc__ + # when freq is given, index is shifted, data is not + index = self._get_axis(axis) + + if freq == "infer": + freq = getattr(index, "freq", None) + + if freq is None: + freq = getattr(index, "inferred_freq", None) + + if freq is None: + msg = "Freq was not set in the index hence cannot be inferred" + raise ValueError(msg) + + elif isinstance(freq, str): + is_period = isinstance(index, PeriodIndex) + freq = to_offset(freq, is_period=is_period) + + if isinstance(index, PeriodIndex): + orig_freq = to_offset(index.freq) + if freq != orig_freq: + assert orig_freq is not None # for mypy + raise ValueError( + f"Given freq {freq_to_period_freqstr(freq.n, freq.name)} " + f"does not match PeriodIndex freq " + f"{freq_to_period_freqstr(orig_freq.n, orig_freq.name)}" + ) + new_ax = index.shift(periods) + else: + new_ax = index.shift(periods, freq) + + result = self.set_axis(new_ax, axis=axis) + return result.__finalize__(self, method="shift") + + @final + def truncate( + self, + before=None, + after=None, + axis: Axis | None = None, + copy: bool_t | None = None, + ) -> Self: + """ + Truncate a Series or DataFrame before and after some index value. + + This is a useful shorthand for boolean indexing based on index + values above or below certain thresholds. + + Parameters + ---------- + before : date, str, int + Truncate all rows before this index value. + after : date, str, int + Truncate all rows after this index value. + axis : {0 or 'index', 1 or 'columns'}, optional + Axis to truncate. Truncates the index (rows) by default. + For `Series` this parameter is unused and defaults to 0. + copy : bool, default is True, + Return a copy of the truncated section. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + + Returns + ------- + type of caller + The truncated Series or DataFrame. + + See Also + -------- + DataFrame.loc : Select a subset of a DataFrame by label. + DataFrame.iloc : Select a subset of a DataFrame by position. + + Notes + ----- + If the index being truncated contains only datetime values, + `before` and `after` may be specified as strings instead of + Timestamps. + + Examples + -------- + >>> df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e'], + ... 'B': ['f', 'g', 'h', 'i', 'j'], + ... 'C': ['k', 'l', 'm', 'n', 'o']}, + ... index=[1, 2, 3, 4, 5]) + >>> df + A B C + 1 a f k + 2 b g l + 3 c h m + 4 d i n + 5 e j o + + >>> df.truncate(before=2, after=4) + A B C + 2 b g l + 3 c h m + 4 d i n + + The columns of a DataFrame can be truncated. + + >>> df.truncate(before="A", after="B", axis="columns") + A B + 1 a f + 2 b g + 3 c h + 4 d i + 5 e j + + For Series, only rows can be truncated. + + >>> df['A'].truncate(before=2, after=4) + 2 b + 3 c + 4 d + Name: A, dtype: object + + The index values in ``truncate`` can be datetimes or string + dates. + + >>> dates = pd.date_range('2016-01-01', '2016-02-01', freq='s') + >>> df = pd.DataFrame(index=dates, data={'A': 1}) + >>> df.tail() + A + 2016-01-31 23:59:56 1 + 2016-01-31 23:59:57 1 + 2016-01-31 23:59:58 1 + 2016-01-31 23:59:59 1 + 2016-02-01 00:00:00 1 + + >>> df.truncate(before=pd.Timestamp('2016-01-05'), + ... after=pd.Timestamp('2016-01-10')).tail() + A + 2016-01-09 23:59:56 1 + 2016-01-09 23:59:57 1 + 2016-01-09 23:59:58 1 + 2016-01-09 23:59:59 1 + 2016-01-10 00:00:00 1 + + Because the index is a DatetimeIndex containing only dates, we can + specify `before` and `after` as strings. They will be coerced to + Timestamps before truncation. + + >>> df.truncate('2016-01-05', '2016-01-10').tail() + A + 2016-01-09 23:59:56 1 + 2016-01-09 23:59:57 1 + 2016-01-09 23:59:58 1 + 2016-01-09 23:59:59 1 + 2016-01-10 00:00:00 1 + + Note that ``truncate`` assumes a 0 value for any unspecified time + component (midnight). This differs from partial string slicing, which + returns any partially matching dates. + + >>> df.loc['2016-01-05':'2016-01-10', :].tail() + A + 2016-01-10 23:59:55 1 + 2016-01-10 23:59:56 1 + 2016-01-10 23:59:57 1 + 2016-01-10 23:59:58 1 + 2016-01-10 23:59:59 1 + """ + if axis is None: + axis = 0 + axis = self._get_axis_number(axis) + ax = self._get_axis(axis) + + # GH 17935 + # Check that index is sorted + if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing: + raise ValueError("truncate requires a sorted index") + + # if we have a date index, convert to dates, otherwise + # treat like a slice + if ax._is_all_dates: + from pandas.core.tools.datetimes import to_datetime + + before = to_datetime(before) + after = to_datetime(after) + + if before is not None and after is not None and before > after: + raise ValueError(f"Truncate: {after} must be after {before}") + + if len(ax) > 1 and ax.is_monotonic_decreasing and ax.nunique() > 1: + before, after = after, before + + slicer = [slice(None, None)] * self._AXIS_LEN + slicer[axis] = slice(before, after) + result = self.loc[tuple(slicer)] + + if isinstance(ax, MultiIndex): + setattr(result, self._get_axis_name(axis), ax.truncate(before, after)) + + result = result.copy(deep=copy and not using_copy_on_write()) + + return result + + @final + @doc(klass=_shared_doc_kwargs["klass"]) + def tz_convert( + self, tz, axis: Axis = 0, level=None, copy: bool_t | None = None + ) -> Self: + """ + Convert tz-aware axis to target time zone. + + Parameters + ---------- + tz : str or tzinfo object or None + Target time zone. Passing ``None`` will convert to + UTC and remove the timezone information. + axis : {{0 or 'index', 1 or 'columns'}}, default 0 + The axis to convert + level : int, str, default None + If axis is a MultiIndex, convert a specific level. Otherwise + must be None. + copy : bool, default True + Also make a copy of the underlying data. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + + Returns + ------- + {klass} + Object with time zone converted axis. + + Raises + ------ + TypeError + If the axis is tz-naive. + + Examples + -------- + Change to another time zone: + + >>> s = pd.Series( + ... [1], + ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00']), + ... ) + >>> s.tz_convert('Asia/Shanghai') + 2018-09-15 07:30:00+08:00 1 + dtype: int64 + + Pass None to convert to UTC and get a tz-naive index: + + >>> s = pd.Series([1], + ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00'])) + >>> s.tz_convert(None) + 2018-09-14 23:30:00 1 + dtype: int64 + """ + axis = self._get_axis_number(axis) + ax = self._get_axis(axis) + + def _tz_convert(ax, tz): + if not hasattr(ax, "tz_convert"): + if len(ax) > 0: + ax_name = self._get_axis_name(axis) + raise TypeError( + f"{ax_name} is not a valid DatetimeIndex or PeriodIndex" + ) + ax = DatetimeIndex([], tz=tz) + else: + ax = ax.tz_convert(tz) + return ax + + # if a level is given it must be a MultiIndex level or + # equivalent to the axis name + if isinstance(ax, MultiIndex): + level = ax._get_level_number(level) + new_level = _tz_convert(ax.levels[level], tz) + ax = ax.set_levels(new_level, level=level) + else: + if level not in (None, 0, ax.name): + raise ValueError(f"The level {level} is not valid") + ax = _tz_convert(ax, tz) + + result = self.copy(deep=copy and not using_copy_on_write()) + result = result.set_axis(ax, axis=axis, copy=False) + return result.__finalize__(self, method="tz_convert") + + @final + @doc(klass=_shared_doc_kwargs["klass"]) + def tz_localize( + self, + tz, + axis: Axis = 0, + level=None, + copy: bool_t | None = None, + ambiguous: TimeAmbiguous = "raise", + nonexistent: TimeNonexistent = "raise", + ) -> Self: + """ + Localize tz-naive index of a Series or DataFrame to target time zone. + + This operation localizes the Index. To localize the values in a + timezone-naive Series, use :meth:`Series.dt.tz_localize`. + + Parameters + ---------- + tz : str or tzinfo or None + Time zone to localize. Passing ``None`` will remove the + time zone information and preserve local time. + axis : {{0 or 'index', 1 or 'columns'}}, default 0 + The axis to localize + level : int, str, default None + If axis ia a MultiIndex, localize a specific level. Otherwise + must be None. + copy : bool, default True + Also make a copy of the underlying data. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + When clocks moved backward due to DST, ambiguous times may arise. + For example in Central European Time (UTC+01), when going from + 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at + 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the + `ambiguous` parameter dictates how ambiguous times should be + handled. + + - 'infer' will attempt to infer fall dst-transition hours based on + order + - bool-ndarray where True signifies a DST time, False designates + a non-DST time (note that this flag is only applicable for + ambiguous times) + - 'NaT' will return NaT where there are ambiguous times + - 'raise' will raise an AmbiguousTimeError if there are ambiguous + times. + nonexistent : str, default 'raise' + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. Valid values are: + + - 'shift_forward' will shift the nonexistent time forward to the + closest existing time + - 'shift_backward' will shift the nonexistent time backward to the + closest existing time + - 'NaT' will return NaT where there are nonexistent times + - timedelta objects will shift nonexistent times by the timedelta + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times. + + Returns + ------- + {klass} + Same type as the input. + + Raises + ------ + TypeError + If the TimeSeries is tz-aware and tz is not None. + + Examples + -------- + Localize local times: + + >>> s = pd.Series( + ... [1], + ... index=pd.DatetimeIndex(['2018-09-15 01:30:00']), + ... ) + >>> s.tz_localize('CET') + 2018-09-15 01:30:00+02:00 1 + dtype: int64 + + Pass None to convert to tz-naive index and preserve local time: + + >>> s = pd.Series([1], + ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00'])) + >>> s.tz_localize(None) + 2018-09-15 01:30:00 1 + dtype: int64 + + Be careful with DST changes. When there is sequential data, pandas + can infer the DST time: + + >>> s = pd.Series(range(7), + ... index=pd.DatetimeIndex(['2018-10-28 01:30:00', + ... '2018-10-28 02:00:00', + ... '2018-10-28 02:30:00', + ... '2018-10-28 02:00:00', + ... '2018-10-28 02:30:00', + ... '2018-10-28 03:00:00', + ... '2018-10-28 03:30:00'])) + >>> s.tz_localize('CET', ambiguous='infer') + 2018-10-28 01:30:00+02:00 0 + 2018-10-28 02:00:00+02:00 1 + 2018-10-28 02:30:00+02:00 2 + 2018-10-28 02:00:00+01:00 3 + 2018-10-28 02:30:00+01:00 4 + 2018-10-28 03:00:00+01:00 5 + 2018-10-28 03:30:00+01:00 6 + dtype: int64 + + In some cases, inferring the DST is impossible. In such cases, you can + pass an ndarray to the ambiguous parameter to set the DST explicitly + + >>> s = pd.Series(range(3), + ... index=pd.DatetimeIndex(['2018-10-28 01:20:00', + ... '2018-10-28 02:36:00', + ... '2018-10-28 03:46:00'])) + >>> s.tz_localize('CET', ambiguous=np.array([True, True, False])) + 2018-10-28 01:20:00+02:00 0 + 2018-10-28 02:36:00+02:00 1 + 2018-10-28 03:46:00+01:00 2 + dtype: int64 + + If the DST transition causes nonexistent times, you can shift these + dates forward or backward with a timedelta object or `'shift_forward'` + or `'shift_backward'`. + + >>> s = pd.Series(range(2), + ... index=pd.DatetimeIndex(['2015-03-29 02:30:00', + ... '2015-03-29 03:30:00'])) + >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_forward') + 2015-03-29 03:00:00+02:00 0 + 2015-03-29 03:30:00+02:00 1 + dtype: int64 + >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_backward') + 2015-03-29 01:59:59.999999999+01:00 0 + 2015-03-29 03:30:00+02:00 1 + dtype: int64 + >>> s.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1h')) + 2015-03-29 03:30:00+02:00 0 + 2015-03-29 03:30:00+02:00 1 + dtype: int64 + """ + nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward") + if nonexistent not in nonexistent_options and not isinstance( + nonexistent, dt.timedelta + ): + raise ValueError( + "The nonexistent argument must be one of 'raise', " + "'NaT', 'shift_forward', 'shift_backward' or " + "a timedelta object" + ) + + axis = self._get_axis_number(axis) + ax = self._get_axis(axis) + + def _tz_localize(ax, tz, ambiguous, nonexistent): + if not hasattr(ax, "tz_localize"): + if len(ax) > 0: + ax_name = self._get_axis_name(axis) + raise TypeError( + f"{ax_name} is not a valid DatetimeIndex or PeriodIndex" + ) + ax = DatetimeIndex([], tz=tz) + else: + ax = ax.tz_localize(tz, ambiguous=ambiguous, nonexistent=nonexistent) + return ax + + # if a level is given it must be a MultiIndex level or + # equivalent to the axis name + if isinstance(ax, MultiIndex): + level = ax._get_level_number(level) + new_level = _tz_localize(ax.levels[level], tz, ambiguous, nonexistent) + ax = ax.set_levels(new_level, level=level) + else: + if level not in (None, 0, ax.name): + raise ValueError(f"The level {level} is not valid") + ax = _tz_localize(ax, tz, ambiguous, nonexistent) + + result = self.copy(deep=copy and not using_copy_on_write()) + result = result.set_axis(ax, axis=axis, copy=False) + return result.__finalize__(self, method="tz_localize") + + # ---------------------------------------------------------------------- + # Numeric Methods + + @final + def describe( + self, + percentiles=None, + include=None, + exclude=None, + ) -> Self: + """ + Generate descriptive statistics. + + Descriptive statistics include those that summarize the central + tendency, dispersion and shape of a + dataset's distribution, excluding ``NaN`` values. + + Analyzes both numeric and object series, as well + as ``DataFrame`` column sets of mixed data types. The output + will vary depending on what is provided. Refer to the notes + below for more detail. + + Parameters + ---------- + percentiles : list-like of numbers, optional + The percentiles to include in the output. All should + fall between 0 and 1. The default is + ``[.25, .5, .75]``, which returns the 25th, 50th, and + 75th percentiles. + include : 'all', list-like of dtypes or None (default), optional + A white list of data types to include in the result. Ignored + for ``Series``. Here are the options: + + - 'all' : All columns of the input will be included in the output. + - A list-like of dtypes : Limits the results to the + provided data types. + To limit the result to numeric types submit + ``numpy.number``. To limit it instead to object columns submit + the ``numpy.object`` data type. Strings + can also be used in the style of + ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To + select pandas categorical columns, use ``'category'`` + - None (default) : The result will include all numeric columns. + exclude : list-like of dtypes or None (default), optional, + A black list of data types to omit from the result. Ignored + for ``Series``. Here are the options: + + - A list-like of dtypes : Excludes the provided data types + from the result. To exclude numeric types submit + ``numpy.number``. To exclude object columns submit the data + type ``numpy.object``. Strings can also be used in the style of + ``select_dtypes`` (e.g. ``df.describe(exclude=['O'])``). To + exclude pandas categorical columns, use ``'category'`` + - None (default) : The result will exclude nothing. + + Returns + ------- + Series or DataFrame + Summary statistics of the Series or Dataframe provided. + + See Also + -------- + DataFrame.count: Count number of non-NA/null observations. + DataFrame.max: Maximum of the values in the object. + DataFrame.min: Minimum of the values in the object. + DataFrame.mean: Mean of the values. + DataFrame.std: Standard deviation of the observations. + DataFrame.select_dtypes: Subset of a DataFrame including/excluding + columns based on their dtype. + + Notes + ----- + For numeric data, the result's index will include ``count``, + ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and + upper percentiles. By default the lower percentile is ``25`` and the + upper percentile is ``75``. The ``50`` percentile is the + same as the median. + + For object data (e.g. strings or timestamps), the result's index + will include ``count``, ``unique``, ``top``, and ``freq``. The ``top`` + is the most common value. The ``freq`` is the most common value's + frequency. Timestamps also include the ``first`` and ``last`` items. + + If multiple object values have the highest count, then the + ``count`` and ``top`` results will be arbitrarily chosen from + among those with the highest count. + + For mixed data types provided via a ``DataFrame``, the default is to + return only an analysis of numeric columns. If the dataframe consists + only of object and categorical data without any numeric columns, the + default is to return an analysis of both the object and categorical + columns. If ``include='all'`` is provided as an option, the result + will include a union of attributes of each type. + + The `include` and `exclude` parameters can be used to limit + which columns in a ``DataFrame`` are analyzed for the output. + The parameters are ignored when analyzing a ``Series``. + + Examples + -------- + Describing a numeric ``Series``. + + >>> s = pd.Series([1, 2, 3]) + >>> s.describe() + count 3.0 + mean 2.0 + std 1.0 + min 1.0 + 25% 1.5 + 50% 2.0 + 75% 2.5 + max 3.0 + dtype: float64 + + Describing a categorical ``Series``. + + >>> s = pd.Series(['a', 'a', 'b', 'c']) + >>> s.describe() + count 4 + unique 3 + top a + freq 2 + dtype: object + + Describing a timestamp ``Series``. + + >>> s = pd.Series([ + ... np.datetime64("2000-01-01"), + ... np.datetime64("2010-01-01"), + ... np.datetime64("2010-01-01") + ... ]) + >>> s.describe() + count 3 + mean 2006-09-01 08:00:00 + min 2000-01-01 00:00:00 + 25% 2004-12-31 12:00:00 + 50% 2010-01-01 00:00:00 + 75% 2010-01-01 00:00:00 + max 2010-01-01 00:00:00 + dtype: object + + Describing a ``DataFrame``. By default only numeric fields + are returned. + + >>> df = pd.DataFrame({'categorical': pd.Categorical(['d', 'e', 'f']), + ... 'numeric': [1, 2, 3], + ... 'object': ['a', 'b', 'c'] + ... }) + >>> df.describe() + numeric + count 3.0 + mean 2.0 + std 1.0 + min 1.0 + 25% 1.5 + 50% 2.0 + 75% 2.5 + max 3.0 + + Describing all columns of a ``DataFrame`` regardless of data type. + + >>> df.describe(include='all') # doctest: +SKIP + categorical numeric object + count 3 3.0 3 + unique 3 NaN 3 + top f NaN a + freq 1 NaN 1 + mean NaN 2.0 NaN + std NaN 1.0 NaN + min NaN 1.0 NaN + 25% NaN 1.5 NaN + 50% NaN 2.0 NaN + 75% NaN 2.5 NaN + max NaN 3.0 NaN + + Describing a column from a ``DataFrame`` by accessing it as + an attribute. + + >>> df.numeric.describe() + count 3.0 + mean 2.0 + std 1.0 + min 1.0 + 25% 1.5 + 50% 2.0 + 75% 2.5 + max 3.0 + Name: numeric, dtype: float64 + + Including only numeric columns in a ``DataFrame`` description. + + >>> df.describe(include=[np.number]) + numeric + count 3.0 + mean 2.0 + std 1.0 + min 1.0 + 25% 1.5 + 50% 2.0 + 75% 2.5 + max 3.0 + + Including only string columns in a ``DataFrame`` description. + + >>> df.describe(include=[object]) # doctest: +SKIP + object + count 3 + unique 3 + top a + freq 1 + + Including only categorical columns from a ``DataFrame`` description. + + >>> df.describe(include=['category']) + categorical + count 3 + unique 3 + top d + freq 1 + + Excluding numeric columns from a ``DataFrame`` description. + + >>> df.describe(exclude=[np.number]) # doctest: +SKIP + categorical object + count 3 3 + unique 3 3 + top f a + freq 1 1 + + Excluding object columns from a ``DataFrame`` description. + + >>> df.describe(exclude=[object]) # doctest: +SKIP + categorical numeric + count 3 3.0 + unique 3 NaN + top f NaN + freq 1 NaN + mean NaN 2.0 + std NaN 1.0 + min NaN 1.0 + 25% NaN 1.5 + 50% NaN 2.0 + 75% NaN 2.5 + max NaN 3.0 + """ + return describe_ndframe( + obj=self, + include=include, + exclude=exclude, + percentiles=percentiles, + ).__finalize__(self, method="describe") + + @final + def pct_change( + self, + periods: int = 1, + fill_method: FillnaOptions | None | lib.NoDefault = lib.no_default, + limit: int | None | lib.NoDefault = lib.no_default, + freq=None, + **kwargs, + ) -> Self: + """ + Fractional change between the current and a prior element. + + Computes the fractional change from the immediately previous row by + default. This is useful in comparing the fraction of change in a time + series of elements. + + .. note:: + + Despite the name of this method, it calculates fractional change + (also known as per unit change or relative change) and not + percentage change. If you need the percentage change, multiply + these values by 100. + + Parameters + ---------- + periods : int, default 1 + Periods to shift for forming percent change. + fill_method : {'backfill', 'bfill', 'pad', 'ffill', None}, default 'pad' + How to handle NAs **before** computing percent changes. + + .. deprecated:: 2.1 + All options of `fill_method` are deprecated except `fill_method=None`. + + limit : int, default None + The number of consecutive NAs to fill before stopping. + + .. deprecated:: 2.1 + + freq : DateOffset, timedelta, or str, optional + Increment to use from time series API (e.g. 'ME' or BDay()). + **kwargs + Additional keyword arguments are passed into + `DataFrame.shift` or `Series.shift`. + + Returns + ------- + Series or DataFrame + The same type as the calling object. + + See Also + -------- + Series.diff : Compute the difference of two elements in a Series. + DataFrame.diff : Compute the difference of two elements in a DataFrame. + Series.shift : Shift the index by some number of periods. + DataFrame.shift : Shift the index by some number of periods. + + Examples + -------- + **Series** + + >>> s = pd.Series([90, 91, 85]) + >>> s + 0 90 + 1 91 + 2 85 + dtype: int64 + + >>> s.pct_change() + 0 NaN + 1 0.011111 + 2 -0.065934 + dtype: float64 + + >>> s.pct_change(periods=2) + 0 NaN + 1 NaN + 2 -0.055556 + dtype: float64 + + See the percentage change in a Series where filling NAs with last + valid observation forward to next valid. + + >>> s = pd.Series([90, 91, None, 85]) + >>> s + 0 90.0 + 1 91.0 + 2 NaN + 3 85.0 + dtype: float64 + + >>> s.ffill().pct_change() + 0 NaN + 1 0.011111 + 2 0.000000 + 3 -0.065934 + dtype: float64 + + **DataFrame** + + Percentage change in French franc, Deutsche Mark, and Italian lira from + 1980-01-01 to 1980-03-01. + + >>> df = pd.DataFrame({ + ... 'FR': [4.0405, 4.0963, 4.3149], + ... 'GR': [1.7246, 1.7482, 1.8519], + ... 'IT': [804.74, 810.01, 860.13]}, + ... index=['1980-01-01', '1980-02-01', '1980-03-01']) + >>> df + FR GR IT + 1980-01-01 4.0405 1.7246 804.74 + 1980-02-01 4.0963 1.7482 810.01 + 1980-03-01 4.3149 1.8519 860.13 + + >>> df.pct_change() + FR GR IT + 1980-01-01 NaN NaN NaN + 1980-02-01 0.013810 0.013684 0.006549 + 1980-03-01 0.053365 0.059318 0.061876 + + Percentage of change in GOOG and APPL stock volume. Shows computing + the percentage change between columns. + + >>> df = pd.DataFrame({ + ... '2016': [1769950, 30586265], + ... '2015': [1500923, 40912316], + ... '2014': [1371819, 41403351]}, + ... index=['GOOG', 'APPL']) + >>> df + 2016 2015 2014 + GOOG 1769950 1500923 1371819 + APPL 30586265 40912316 41403351 + + >>> df.pct_change(axis='columns', periods=-1) + 2016 2015 2014 + GOOG 0.179241 0.094112 NaN + APPL -0.252395 -0.011860 NaN + """ + # GH#53491 + if fill_method not in (lib.no_default, None) or limit is not lib.no_default: + warnings.warn( + "The 'fill_method' keyword being not None and the 'limit' keyword in " + f"{type(self).__name__}.pct_change are deprecated and will be removed " + "in a future version. Either fill in any non-leading NA values prior " + "to calling pct_change or specify 'fill_method=None' to not fill NA " + "values.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if fill_method is lib.no_default: + if limit is lib.no_default: + cols = self.items() if self.ndim == 2 else [(None, self)] + for _, col in cols: + if len(col) > 0: + mask = col.isna().values + mask = mask[np.argmax(~mask) :] + if mask.any(): + warnings.warn( + "The default fill_method='pad' in " + f"{type(self).__name__}.pct_change is deprecated and " + "will be removed in a future version. Either fill in " + "any non-leading NA values prior to calling pct_change " + "or specify 'fill_method=None' to not fill NA values.", + FutureWarning, + stacklevel=find_stack_level(), + ) + break + fill_method = "pad" + if limit is lib.no_default: + limit = None + + axis = self._get_axis_number(kwargs.pop("axis", "index")) + if fill_method is None: + data = self + else: + data = self._pad_or_backfill(fill_method, axis=axis, limit=limit) + + shifted = data.shift(periods=periods, freq=freq, axis=axis, **kwargs) + # Unsupported left operand type for / ("Self") + rs = data / shifted - 1 # type: ignore[operator] + if freq is not None: + # Shift method is implemented differently when freq is not None + # We want to restore the original index + rs = rs.loc[~rs.index.duplicated()] + rs = rs.reindex_like(data) + return rs.__finalize__(self, method="pct_change") + + @final + def _logical_func( + self, + name: str, + func, + axis: Axis | None = 0, + bool_only: bool_t = False, + skipna: bool_t = True, + **kwargs, + ) -> Series | bool_t: + nv.validate_logical_func((), kwargs, fname=name) + validate_bool_kwarg(skipna, "skipna", none_allowed=False) + + if self.ndim > 1 and axis is None: + # Reduce along one dimension then the other, to simplify DataFrame._reduce + res = self._logical_func( + name, func, axis=0, bool_only=bool_only, skipna=skipna, **kwargs + ) + # error: Item "bool" of "Series | bool" has no attribute "_logical_func" + return res._logical_func( # type: ignore[union-attr] + name, func, skipna=skipna, **kwargs + ) + elif axis is None: + axis = 0 + + if ( + self.ndim > 1 + and axis == 1 + and len(self._mgr.arrays) > 1 + # TODO(EA2D): special-case not needed + and all(x.ndim == 2 for x in self._mgr.arrays) + and not kwargs + ): + # Fastpath avoiding potentially expensive transpose + obj = self + if bool_only: + obj = self._get_bool_data() + return obj._reduce_axis1(name, func, skipna=skipna) + + return self._reduce( + func, + name=name, + axis=axis, + skipna=skipna, + numeric_only=bool_only, + filter_type="bool", + ) + + def any( + self, + axis: Axis | None = 0, + bool_only: bool_t = False, + skipna: bool_t = True, + **kwargs, + ) -> Series | bool_t: + return self._logical_func( + "any", nanops.nanany, axis, bool_only, skipna, **kwargs + ) + + def all( + self, + axis: Axis = 0, + bool_only: bool_t = False, + skipna: bool_t = True, + **kwargs, + ) -> Series | bool_t: + return self._logical_func( + "all", nanops.nanall, axis, bool_only, skipna, **kwargs + ) + + @final + def _accum_func( + self, + name: str, + func, + axis: Axis | None = None, + skipna: bool_t = True, + *args, + **kwargs, + ): + skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name) + if axis is None: + axis = 0 + else: + axis = self._get_axis_number(axis) + + if axis == 1: + return self.T._accum_func( + name, func, axis=0, skipna=skipna, *args, **kwargs # noqa: B026 + ).T + + def block_accum_func(blk_values): + values = blk_values.T if hasattr(blk_values, "T") else blk_values + + result: np.ndarray | ExtensionArray + if isinstance(values, ExtensionArray): + result = values._accumulate(name, skipna=skipna, **kwargs) + else: + result = nanops.na_accum_func(values, func, skipna=skipna) + + result = result.T if hasattr(result, "T") else result + return result + + result = self._mgr.apply(block_accum_func) + + return self._constructor_from_mgr(result, axes=result.axes).__finalize__( + self, method=name + ) + + def cummax(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs): + return self._accum_func( + "cummax", np.maximum.accumulate, axis, skipna, *args, **kwargs + ) + + def cummin(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs): + return self._accum_func( + "cummin", np.minimum.accumulate, axis, skipna, *args, **kwargs + ) + + def cumsum(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs): + return self._accum_func("cumsum", np.cumsum, axis, skipna, *args, **kwargs) + + def cumprod(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs): + return self._accum_func("cumprod", np.cumprod, axis, skipna, *args, **kwargs) + + @final + def _stat_function_ddof( + self, + name: str, + func, + axis: Axis | None | lib.NoDefault = lib.no_default, + skipna: bool_t = True, + ddof: int = 1, + numeric_only: bool_t = False, + **kwargs, + ) -> Series | float: + nv.validate_stat_ddof_func((), kwargs, fname=name) + validate_bool_kwarg(skipna, "skipna", none_allowed=False) + + if axis is None: + if self.ndim > 1: + warnings.warn( + f"The behavior of {type(self).__name__}.{name} with axis=None " + "is deprecated, in a future version this will reduce over both " + "axes and return a scalar. To retain the old behavior, pass " + "axis=0 (or do not pass axis)", + FutureWarning, + stacklevel=find_stack_level(), + ) + axis = 0 + elif axis is lib.no_default: + axis = 0 + + return self._reduce( + func, name, axis=axis, numeric_only=numeric_only, skipna=skipna, ddof=ddof + ) + + def sem( + self, + axis: Axis | None = 0, + skipna: bool_t = True, + ddof: int = 1, + numeric_only: bool_t = False, + **kwargs, + ) -> Series | float: + return self._stat_function_ddof( + "sem", nanops.nansem, axis, skipna, ddof, numeric_only, **kwargs + ) + + def var( + self, + axis: Axis | None = 0, + skipna: bool_t = True, + ddof: int = 1, + numeric_only: bool_t = False, + **kwargs, + ) -> Series | float: + return self._stat_function_ddof( + "var", nanops.nanvar, axis, skipna, ddof, numeric_only, **kwargs + ) + + def std( + self, + axis: Axis | None = 0, + skipna: bool_t = True, + ddof: int = 1, + numeric_only: bool_t = False, + **kwargs, + ) -> Series | float: + return self._stat_function_ddof( + "std", nanops.nanstd, axis, skipna, ddof, numeric_only, **kwargs + ) + + @final + def _stat_function( + self, + name: str, + func, + axis: Axis | None = 0, + skipna: bool_t = True, + numeric_only: bool_t = False, + **kwargs, + ): + assert name in ["median", "mean", "min", "max", "kurt", "skew"], name + nv.validate_func(name, (), kwargs) + + validate_bool_kwarg(skipna, "skipna", none_allowed=False) + + return self._reduce( + func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only + ) + + def min( + self, + axis: Axis | None = 0, + skipna: bool_t = True, + numeric_only: bool_t = False, + **kwargs, + ): + return self._stat_function( + "min", + nanops.nanmin, + axis, + skipna, + numeric_only, + **kwargs, + ) + + def max( + self, + axis: Axis | None = 0, + skipna: bool_t = True, + numeric_only: bool_t = False, + **kwargs, + ): + return self._stat_function( + "max", + nanops.nanmax, + axis, + skipna, + numeric_only, + **kwargs, + ) + + def mean( + self, + axis: Axis | None = 0, + skipna: bool_t = True, + numeric_only: bool_t = False, + **kwargs, + ) -> Series | float: + return self._stat_function( + "mean", nanops.nanmean, axis, skipna, numeric_only, **kwargs + ) + + def median( + self, + axis: Axis | None = 0, + skipna: bool_t = True, + numeric_only: bool_t = False, + **kwargs, + ) -> Series | float: + return self._stat_function( + "median", nanops.nanmedian, axis, skipna, numeric_only, **kwargs + ) + + def skew( + self, + axis: Axis | None = 0, + skipna: bool_t = True, + numeric_only: bool_t = False, + **kwargs, + ) -> Series | float: + return self._stat_function( + "skew", nanops.nanskew, axis, skipna, numeric_only, **kwargs + ) + + def kurt( + self, + axis: Axis | None = 0, + skipna: bool_t = True, + numeric_only: bool_t = False, + **kwargs, + ) -> Series | float: + return self._stat_function( + "kurt", nanops.nankurt, axis, skipna, numeric_only, **kwargs + ) + + kurtosis = kurt + + @final + def _min_count_stat_function( + self, + name: str, + func, + axis: Axis | None | lib.NoDefault = lib.no_default, + skipna: bool_t = True, + numeric_only: bool_t = False, + min_count: int = 0, + **kwargs, + ): + assert name in ["sum", "prod"], name + nv.validate_func(name, (), kwargs) + + validate_bool_kwarg(skipna, "skipna", none_allowed=False) + + if axis is None: + if self.ndim > 1: + warnings.warn( + f"The behavior of {type(self).__name__}.{name} with axis=None " + "is deprecated, in a future version this will reduce over both " + "axes and return a scalar. To retain the old behavior, pass " + "axis=0 (or do not pass axis)", + FutureWarning, + stacklevel=find_stack_level(), + ) + axis = 0 + elif axis is lib.no_default: + axis = 0 + + return self._reduce( + func, + name=name, + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + min_count=min_count, + ) + + def sum( + self, + axis: Axis | None = 0, + skipna: bool_t = True, + numeric_only: bool_t = False, + min_count: int = 0, + **kwargs, + ): + return self._min_count_stat_function( + "sum", nanops.nansum, axis, skipna, numeric_only, min_count, **kwargs + ) + + def prod( + self, + axis: Axis | None = 0, + skipna: bool_t = True, + numeric_only: bool_t = False, + min_count: int = 0, + **kwargs, + ): + return self._min_count_stat_function( + "prod", + nanops.nanprod, + axis, + skipna, + numeric_only, + min_count, + **kwargs, + ) + + product = prod + + @final + @doc(Rolling) + def rolling( + self, + window: int | dt.timedelta | str | BaseOffset | BaseIndexer, + min_periods: int | None = None, + center: bool_t = False, + win_type: str | None = None, + on: str | None = None, + axis: Axis | lib.NoDefault = lib.no_default, + closed: IntervalClosedType | None = None, + step: int | None = None, + method: str = "single", + ) -> Window | Rolling: + if axis is not lib.no_default: + axis = self._get_axis_number(axis) + name = "rolling" + if axis == 1: + warnings.warn( + f"Support for axis=1 in {type(self).__name__}.{name} is " + "deprecated and will be removed in a future version. " + f"Use obj.T.{name}(...) instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + warnings.warn( + f"The 'axis' keyword in {type(self).__name__}.{name} is " + "deprecated and will be removed in a future version. " + "Call the method without the axis keyword instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + axis = 0 + + if win_type is not None: + return Window( + self, + window=window, + min_periods=min_periods, + center=center, + win_type=win_type, + on=on, + axis=axis, + closed=closed, + step=step, + method=method, + ) + + return Rolling( + self, + window=window, + min_periods=min_periods, + center=center, + win_type=win_type, + on=on, + axis=axis, + closed=closed, + step=step, + method=method, + ) + + @final + @doc(Expanding) + def expanding( + self, + min_periods: int = 1, + axis: Axis | lib.NoDefault = lib.no_default, + method: Literal["single", "table"] = "single", + ) -> Expanding: + if axis is not lib.no_default: + axis = self._get_axis_number(axis) + name = "expanding" + if axis == 1: + warnings.warn( + f"Support for axis=1 in {type(self).__name__}.{name} is " + "deprecated and will be removed in a future version. " + f"Use obj.T.{name}(...) instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + warnings.warn( + f"The 'axis' keyword in {type(self).__name__}.{name} is " + "deprecated and will be removed in a future version. " + "Call the method without the axis keyword instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + axis = 0 + return Expanding(self, min_periods=min_periods, axis=axis, method=method) + + @final + @doc(ExponentialMovingWindow) + def ewm( + self, + com: float | None = None, + span: float | None = None, + halflife: float | TimedeltaConvertibleTypes | None = None, + alpha: float | None = None, + min_periods: int | None = 0, + adjust: bool_t = True, + ignore_na: bool_t = False, + axis: Axis | lib.NoDefault = lib.no_default, + times: np.ndarray | DataFrame | Series | None = None, + method: Literal["single", "table"] = "single", + ) -> ExponentialMovingWindow: + if axis is not lib.no_default: + axis = self._get_axis_number(axis) + name = "ewm" + if axis == 1: + warnings.warn( + f"Support for axis=1 in {type(self).__name__}.{name} is " + "deprecated and will be removed in a future version. " + f"Use obj.T.{name}(...) instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + warnings.warn( + f"The 'axis' keyword in {type(self).__name__}.{name} is " + "deprecated and will be removed in a future version. " + "Call the method without the axis keyword instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + axis = 0 + + return ExponentialMovingWindow( + self, + com=com, + span=span, + halflife=halflife, + alpha=alpha, + min_periods=min_periods, + adjust=adjust, + ignore_na=ignore_na, + axis=axis, + times=times, + method=method, + ) + + # ---------------------------------------------------------------------- + # Arithmetic Methods + + @final + def _inplace_method(self, other, op) -> Self: + """ + Wrap arithmetic method to operate inplace. + """ + warn = True + if not PYPY and warn_copy_on_write(): + if sys.getrefcount(self) <= REF_COUNT + 2: + # we are probably in an inplace setitem context (e.g. df['a'] += 1) + warn = False + + result = op(self, other) + + if ( + self.ndim == 1 + and result._indexed_same(self) + and result.dtype == self.dtype + and not using_copy_on_write() + and not (warn_copy_on_write() and not warn) + ): + # GH#36498 this inplace op can _actually_ be inplace. + # Item "ArrayManager" of "Union[ArrayManager, SingleArrayManager, + # BlockManager, SingleBlockManager]" has no attribute "setitem_inplace" + self._mgr.setitem_inplace( # type: ignore[union-attr] + slice(None), result._values, warn=warn + ) + return self + + # Delete cacher + self._reset_cacher() + + # this makes sure that we are aligned like the input + # we are updating inplace so we want to ignore is_copy + self._update_inplace( + result.reindex_like(self, copy=False), verify_is_copy=False + ) + return self + + @final + def __iadd__(self, other) -> Self: + # error: Unsupported left operand type for + ("Type[NDFrame]") + return self._inplace_method(other, type(self).__add__) # type: ignore[operator] + + @final + def __isub__(self, other) -> Self: + # error: Unsupported left operand type for - ("Type[NDFrame]") + return self._inplace_method(other, type(self).__sub__) # type: ignore[operator] + + @final + def __imul__(self, other) -> Self: + # error: Unsupported left operand type for * ("Type[NDFrame]") + return self._inplace_method(other, type(self).__mul__) # type: ignore[operator] + + @final + def __itruediv__(self, other) -> Self: + # error: Unsupported left operand type for / ("Type[NDFrame]") + return self._inplace_method( + other, type(self).__truediv__ # type: ignore[operator] + ) + + @final + def __ifloordiv__(self, other) -> Self: + # error: Unsupported left operand type for // ("Type[NDFrame]") + return self._inplace_method( + other, type(self).__floordiv__ # type: ignore[operator] + ) + + @final + def __imod__(self, other) -> Self: + # error: Unsupported left operand type for % ("Type[NDFrame]") + return self._inplace_method(other, type(self).__mod__) # type: ignore[operator] + + @final + def __ipow__(self, other) -> Self: + # error: Unsupported left operand type for ** ("Type[NDFrame]") + return self._inplace_method(other, type(self).__pow__) # type: ignore[operator] + + @final + def __iand__(self, other) -> Self: + # error: Unsupported left operand type for & ("Type[NDFrame]") + return self._inplace_method(other, type(self).__and__) # type: ignore[operator] + + @final + def __ior__(self, other) -> Self: + return self._inplace_method(other, type(self).__or__) + + @final + def __ixor__(self, other) -> Self: + # error: Unsupported left operand type for ^ ("Type[NDFrame]") + return self._inplace_method(other, type(self).__xor__) # type: ignore[operator] + + # ---------------------------------------------------------------------- + # Misc methods + + @final + def _find_valid_index(self, *, how: str) -> Hashable | None: + """ + Retrieves the index of the first valid value. + + Parameters + ---------- + how : {'first', 'last'} + Use this parameter to change between the first or last valid index. + + Returns + ------- + idx_first_valid : type of index + """ + is_valid = self.notna().values + idxpos = find_valid_index(how=how, is_valid=is_valid) + if idxpos is None: + return None + return self.index[idxpos] + + @final + @doc(position="first", klass=_shared_doc_kwargs["klass"]) + def first_valid_index(self) -> Hashable | None: + """ + Return index for {position} non-NA value or None, if no non-NA value is found. + + Returns + ------- + type of index + + Examples + -------- + For Series: + + >>> s = pd.Series([None, 3, 4]) + >>> s.first_valid_index() + 1 + >>> s.last_valid_index() + 2 + + >>> s = pd.Series([None, None]) + >>> print(s.first_valid_index()) + None + >>> print(s.last_valid_index()) + None + + If all elements in Series are NA/null, returns None. + + >>> s = pd.Series() + >>> print(s.first_valid_index()) + None + >>> print(s.last_valid_index()) + None + + If Series is empty, returns None. + + For DataFrame: + + >>> df = pd.DataFrame({{'A': [None, None, 2], 'B': [None, 3, 4]}}) + >>> df + A B + 0 NaN NaN + 1 NaN 3.0 + 2 2.0 4.0 + >>> df.first_valid_index() + 1 + >>> df.last_valid_index() + 2 + + >>> df = pd.DataFrame({{'A': [None, None, None], 'B': [None, None, None]}}) + >>> df + A B + 0 None None + 1 None None + 2 None None + >>> print(df.first_valid_index()) + None + >>> print(df.last_valid_index()) + None + + If all elements in DataFrame are NA/null, returns None. + + >>> df = pd.DataFrame() + >>> df + Empty DataFrame + Columns: [] + Index: [] + >>> print(df.first_valid_index()) + None + >>> print(df.last_valid_index()) + None + + If DataFrame is empty, returns None. + """ + return self._find_valid_index(how="first") + + @final + @doc(first_valid_index, position="last", klass=_shared_doc_kwargs["klass"]) + def last_valid_index(self) -> Hashable | None: + return self._find_valid_index(how="last") + + +_num_doc = """ +{desc} + +Parameters +---------- +axis : {axis_descr} + Axis for the function to be applied on. + For `Series` this parameter is unused and defaults to 0. + + For DataFrames, specifying ``axis=None`` will apply the aggregation + across both axes. + + .. versionadded:: 2.0.0 + +skipna : bool, default True + Exclude NA/null values when computing the result. +numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + +{min_count}\ +**kwargs + Additional keyword arguments to be passed to the function. + +Returns +------- +{name1} or scalar\ +{see_also}\ +{examples} +""" + +_sum_prod_doc = """ +{desc} + +Parameters +---------- +axis : {axis_descr} + Axis for the function to be applied on. + For `Series` this parameter is unused and defaults to 0. + + .. warning:: + + The behavior of DataFrame.{name} with ``axis=None`` is deprecated, + in a future version this will reduce over both axes and return a scalar + To retain the old behavior, pass axis=0 (or do not pass axis). + + .. versionadded:: 2.0.0 + +skipna : bool, default True + Exclude NA/null values when computing the result. +numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + +{min_count}\ +**kwargs + Additional keyword arguments to be passed to the function. + +Returns +------- +{name1} or scalar\ +{see_also}\ +{examples} +""" + +_num_ddof_doc = """ +{desc} + +Parameters +---------- +axis : {axis_descr} + For `Series` this parameter is unused and defaults to 0. + + .. warning:: + + The behavior of DataFrame.{name} with ``axis=None`` is deprecated, + in a future version this will reduce over both axes and return a scalar + To retain the old behavior, pass axis=0 (or do not pass axis). + +skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. +ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. +numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + +Returns +------- +{name1} or {name2} (if level specified) \ +{notes}\ +{examples} +""" + +_std_notes = """ + +Notes +----- +To have the same behaviour as `numpy.std`, use `ddof=0` (instead of the +default `ddof=1`)""" + +_std_examples = """ + +Examples +-------- +>>> df = pd.DataFrame({'person_id': [0, 1, 2, 3], +... 'age': [21, 25, 62, 43], +... 'height': [1.61, 1.87, 1.49, 2.01]} +... ).set_index('person_id') +>>> df + age height +person_id +0 21 1.61 +1 25 1.87 +2 62 1.49 +3 43 2.01 + +The standard deviation of the columns can be found as follows: + +>>> df.std() +age 18.786076 +height 0.237417 +dtype: float64 + +Alternatively, `ddof=0` can be set to normalize by N instead of N-1: + +>>> df.std(ddof=0) +age 16.269219 +height 0.205609 +dtype: float64""" + +_var_examples = """ + +Examples +-------- +>>> df = pd.DataFrame({'person_id': [0, 1, 2, 3], +... 'age': [21, 25, 62, 43], +... 'height': [1.61, 1.87, 1.49, 2.01]} +... ).set_index('person_id') +>>> df + age height +person_id +0 21 1.61 +1 25 1.87 +2 62 1.49 +3 43 2.01 + +>>> df.var() +age 352.916667 +height 0.056367 +dtype: float64 + +Alternatively, ``ddof=0`` can be set to normalize by N instead of N-1: + +>>> df.var(ddof=0) +age 264.687500 +height 0.042275 +dtype: float64""" + +_bool_doc = """ +{desc} + +Parameters +---------- +axis : {{0 or 'index', 1 or 'columns', None}}, default 0 + Indicate which axis or axes should be reduced. For `Series` this parameter + is unused and defaults to 0. + + * 0 / 'index' : reduce the index, return a Series whose index is the + original column labels. + * 1 / 'columns' : reduce the columns, return a Series whose index is the + original index. + * None : reduce all axes, return a scalar. + +bool_only : bool, default False + Include only boolean columns. Not implemented for Series. +skipna : bool, default True + Exclude NA/null values. If the entire row/column is NA and skipna is + True, then the result will be {empty_value}, as for an empty row/column. + If skipna is False, then NA are treated as True, because these are not + equal to zero. +**kwargs : any, default None + Additional keywords have no effect but might be accepted for + compatibility with NumPy. + +Returns +------- +{name1} or {name2} + If level is specified, then, {name2} is returned; otherwise, {name1} + is returned. + +{see_also} +{examples}""" + +_all_desc = """\ +Return whether all elements are True, potentially over an axis. + +Returns True unless there at least one element within a series or +along a Dataframe axis that is False or equivalent (e.g. zero or +empty).""" + +_all_examples = """\ +Examples +-------- +**Series** + +>>> pd.Series([True, True]).all() +True +>>> pd.Series([True, False]).all() +False +>>> pd.Series([], dtype="float64").all() +True +>>> pd.Series([np.nan]).all() +True +>>> pd.Series([np.nan]).all(skipna=False) +True + +**DataFrames** + +Create a dataframe from a dictionary. + +>>> df = pd.DataFrame({'col1': [True, True], 'col2': [True, False]}) +>>> df + col1 col2 +0 True True +1 True False + +Default behaviour checks if values in each column all return True. + +>>> df.all() +col1 True +col2 False +dtype: bool + +Specify ``axis='columns'`` to check if values in each row all return True. + +>>> df.all(axis='columns') +0 True +1 False +dtype: bool + +Or ``axis=None`` for whether every value is True. + +>>> df.all(axis=None) +False +""" + +_all_see_also = """\ +See Also +-------- +Series.all : Return True if all elements are True. +DataFrame.any : Return True if one (or more) elements are True. +""" + +_cnum_doc = """ +Return cumulative {desc} over a DataFrame or Series axis. + +Returns a DataFrame or Series of the same size containing the cumulative +{desc}. + +Parameters +---------- +axis : {{0 or 'index', 1 or 'columns'}}, default 0 + The index or the name of the axis. 0 is equivalent to None or 'index'. + For `Series` this parameter is unused and defaults to 0. +skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. +*args, **kwargs + Additional keywords have no effect but might be accepted for + compatibility with NumPy. + +Returns +------- +{name1} or {name2} + Return cumulative {desc} of {name1} or {name2}. + +See Also +-------- +core.window.expanding.Expanding.{accum_func_name} : Similar functionality + but ignores ``NaN`` values. +{name2}.{accum_func_name} : Return the {desc} over + {name2} axis. +{name2}.cummax : Return cumulative maximum over {name2} axis. +{name2}.cummin : Return cumulative minimum over {name2} axis. +{name2}.cumsum : Return cumulative sum over {name2} axis. +{name2}.cumprod : Return cumulative product over {name2} axis. + +{examples}""" + +_cummin_examples = """\ +Examples +-------- +**Series** + +>>> s = pd.Series([2, np.nan, 5, -1, 0]) +>>> s +0 2.0 +1 NaN +2 5.0 +3 -1.0 +4 0.0 +dtype: float64 + +By default, NA values are ignored. + +>>> s.cummin() +0 2.0 +1 NaN +2 2.0 +3 -1.0 +4 -1.0 +dtype: float64 + +To include NA values in the operation, use ``skipna=False`` + +>>> s.cummin(skipna=False) +0 2.0 +1 NaN +2 NaN +3 NaN +4 NaN +dtype: float64 + +**DataFrame** + +>>> df = pd.DataFrame([[2.0, 1.0], +... [3.0, np.nan], +... [1.0, 0.0]], +... columns=list('AB')) +>>> df + A B +0 2.0 1.0 +1 3.0 NaN +2 1.0 0.0 + +By default, iterates over rows and finds the minimum +in each column. This is equivalent to ``axis=None`` or ``axis='index'``. + +>>> df.cummin() + A B +0 2.0 1.0 +1 2.0 NaN +2 1.0 0.0 + +To iterate over columns and find the minimum in each row, +use ``axis=1`` + +>>> df.cummin(axis=1) + A B +0 2.0 1.0 +1 3.0 NaN +2 1.0 0.0 +""" + +_cumsum_examples = """\ +Examples +-------- +**Series** + +>>> s = pd.Series([2, np.nan, 5, -1, 0]) +>>> s +0 2.0 +1 NaN +2 5.0 +3 -1.0 +4 0.0 +dtype: float64 + +By default, NA values are ignored. + +>>> s.cumsum() +0 2.0 +1 NaN +2 7.0 +3 6.0 +4 6.0 +dtype: float64 + +To include NA values in the operation, use ``skipna=False`` + +>>> s.cumsum(skipna=False) +0 2.0 +1 NaN +2 NaN +3 NaN +4 NaN +dtype: float64 + +**DataFrame** + +>>> df = pd.DataFrame([[2.0, 1.0], +... [3.0, np.nan], +... [1.0, 0.0]], +... columns=list('AB')) +>>> df + A B +0 2.0 1.0 +1 3.0 NaN +2 1.0 0.0 + +By default, iterates over rows and finds the sum +in each column. This is equivalent to ``axis=None`` or ``axis='index'``. + +>>> df.cumsum() + A B +0 2.0 1.0 +1 5.0 NaN +2 6.0 1.0 + +To iterate over columns and find the sum in each row, +use ``axis=1`` + +>>> df.cumsum(axis=1) + A B +0 2.0 3.0 +1 3.0 NaN +2 1.0 1.0 +""" + +_cumprod_examples = """\ +Examples +-------- +**Series** + +>>> s = pd.Series([2, np.nan, 5, -1, 0]) +>>> s +0 2.0 +1 NaN +2 5.0 +3 -1.0 +4 0.0 +dtype: float64 + +By default, NA values are ignored. + +>>> s.cumprod() +0 2.0 +1 NaN +2 10.0 +3 -10.0 +4 -0.0 +dtype: float64 + +To include NA values in the operation, use ``skipna=False`` + +>>> s.cumprod(skipna=False) +0 2.0 +1 NaN +2 NaN +3 NaN +4 NaN +dtype: float64 + +**DataFrame** + +>>> df = pd.DataFrame([[2.0, 1.0], +... [3.0, np.nan], +... [1.0, 0.0]], +... columns=list('AB')) +>>> df + A B +0 2.0 1.0 +1 3.0 NaN +2 1.0 0.0 + +By default, iterates over rows and finds the product +in each column. This is equivalent to ``axis=None`` or ``axis='index'``. + +>>> df.cumprod() + A B +0 2.0 1.0 +1 6.0 NaN +2 6.0 0.0 + +To iterate over columns and find the product in each row, +use ``axis=1`` + +>>> df.cumprod(axis=1) + A B +0 2.0 2.0 +1 3.0 NaN +2 1.0 0.0 +""" + +_cummax_examples = """\ +Examples +-------- +**Series** + +>>> s = pd.Series([2, np.nan, 5, -1, 0]) +>>> s +0 2.0 +1 NaN +2 5.0 +3 -1.0 +4 0.0 +dtype: float64 + +By default, NA values are ignored. + +>>> s.cummax() +0 2.0 +1 NaN +2 5.0 +3 5.0 +4 5.0 +dtype: float64 + +To include NA values in the operation, use ``skipna=False`` + +>>> s.cummax(skipna=False) +0 2.0 +1 NaN +2 NaN +3 NaN +4 NaN +dtype: float64 + +**DataFrame** + +>>> df = pd.DataFrame([[2.0, 1.0], +... [3.0, np.nan], +... [1.0, 0.0]], +... columns=list('AB')) +>>> df + A B +0 2.0 1.0 +1 3.0 NaN +2 1.0 0.0 + +By default, iterates over rows and finds the maximum +in each column. This is equivalent to ``axis=None`` or ``axis='index'``. + +>>> df.cummax() + A B +0 2.0 1.0 +1 3.0 NaN +2 3.0 1.0 + +To iterate over columns and find the maximum in each row, +use ``axis=1`` + +>>> df.cummax(axis=1) + A B +0 2.0 2.0 +1 3.0 NaN +2 1.0 1.0 +""" + +_any_see_also = """\ +See Also +-------- +numpy.any : Numpy version of this method. +Series.any : Return whether any element is True. +Series.all : Return whether all elements are True. +DataFrame.any : Return whether any element is True over requested axis. +DataFrame.all : Return whether all elements are True over requested axis. +""" + +_any_desc = """\ +Return whether any element is True, potentially over an axis. + +Returns False unless there is at least one element within a series or +along a Dataframe axis that is True or equivalent (e.g. non-zero or +non-empty).""" + +_any_examples = """\ +Examples +-------- +**Series** + +For Series input, the output is a scalar indicating whether any element +is True. + +>>> pd.Series([False, False]).any() +False +>>> pd.Series([True, False]).any() +True +>>> pd.Series([], dtype="float64").any() +False +>>> pd.Series([np.nan]).any() +False +>>> pd.Series([np.nan]).any(skipna=False) +True + +**DataFrame** + +Whether each column contains at least one True element (the default). + +>>> df = pd.DataFrame({"A": [1, 2], "B": [0, 2], "C": [0, 0]}) +>>> df + A B C +0 1 0 0 +1 2 2 0 + +>>> df.any() +A True +B True +C False +dtype: bool + +Aggregating over the columns. + +>>> df = pd.DataFrame({"A": [True, False], "B": [1, 2]}) +>>> df + A B +0 True 1 +1 False 2 + +>>> df.any(axis='columns') +0 True +1 True +dtype: bool + +>>> df = pd.DataFrame({"A": [True, False], "B": [1, 0]}) +>>> df + A B +0 True 1 +1 False 0 + +>>> df.any(axis='columns') +0 True +1 False +dtype: bool + +Aggregating over the entire DataFrame with ``axis=None``. + +>>> df.any(axis=None) +True + +`any` for an empty DataFrame is an empty Series. + +>>> pd.DataFrame([]).any() +Series([], dtype: bool) +""" + +_shared_docs[ + "stat_func_example" +] = """ + +Examples +-------- +>>> idx = pd.MultiIndex.from_arrays([ +... ['warm', 'warm', 'cold', 'cold'], +... ['dog', 'falcon', 'fish', 'spider']], +... names=['blooded', 'animal']) +>>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx) +>>> s +blooded animal +warm dog 4 + falcon 2 +cold fish 0 + spider 8 +Name: legs, dtype: int64 + +>>> s.{stat_func}() +{default_output}""" + +_sum_examples = _shared_docs["stat_func_example"].format( + stat_func="sum", verb="Sum", default_output=14, level_output_0=6, level_output_1=8 +) + +_sum_examples += """ + +By default, the sum of an empty or all-NA Series is ``0``. + +>>> pd.Series([], dtype="float64").sum() # min_count=0 is the default +0.0 + +This can be controlled with the ``min_count`` parameter. For example, if +you'd like the sum of an empty series to be NaN, pass ``min_count=1``. + +>>> pd.Series([], dtype="float64").sum(min_count=1) +nan + +Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and +empty series identically. + +>>> pd.Series([np.nan]).sum() +0.0 + +>>> pd.Series([np.nan]).sum(min_count=1) +nan""" + +_max_examples: str = _shared_docs["stat_func_example"].format( + stat_func="max", verb="Max", default_output=8, level_output_0=4, level_output_1=8 +) + +_min_examples: str = _shared_docs["stat_func_example"].format( + stat_func="min", verb="Min", default_output=0, level_output_0=2, level_output_1=0 +) + +_stat_func_see_also = """ + +See Also +-------- +Series.sum : Return the sum. +Series.min : Return the minimum. +Series.max : Return the maximum. +Series.idxmin : Return the index of the minimum. +Series.idxmax : Return the index of the maximum. +DataFrame.sum : Return the sum over the requested axis. +DataFrame.min : Return the minimum over the requested axis. +DataFrame.max : Return the maximum over the requested axis. +DataFrame.idxmin : Return the index of the minimum over the requested axis. +DataFrame.idxmax : Return the index of the maximum over the requested axis.""" + +_prod_examples = """ + +Examples +-------- +By default, the product of an empty or all-NA Series is ``1`` + +>>> pd.Series([], dtype="float64").prod() +1.0 + +This can be controlled with the ``min_count`` parameter + +>>> pd.Series([], dtype="float64").prod(min_count=1) +nan + +Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and +empty series identically. + +>>> pd.Series([np.nan]).prod() +1.0 + +>>> pd.Series([np.nan]).prod(min_count=1) +nan""" + +_min_count_stub = """\ +min_count : int, default 0 + The required number of valid values to perform the operation. If fewer than + ``min_count`` non-NA values are present the result will be NA. +""" + + +def make_doc(name: str, ndim: int) -> str: + """ + Generate the docstring for a Series/DataFrame reduction. + """ + if ndim == 1: + name1 = "scalar" + name2 = "Series" + axis_descr = "{index (0)}" + else: + name1 = "Series" + name2 = "DataFrame" + axis_descr = "{index (0), columns (1)}" + + if name == "any": + base_doc = _bool_doc + desc = _any_desc + see_also = _any_see_also + examples = _any_examples + kwargs = {"empty_value": "False"} + elif name == "all": + base_doc = _bool_doc + desc = _all_desc + see_also = _all_see_also + examples = _all_examples + kwargs = {"empty_value": "True"} + elif name == "min": + base_doc = _num_doc + desc = ( + "Return the minimum of the values over the requested axis.\n\n" + "If you want the *index* of the minimum, use ``idxmin``. This is " + "the equivalent of the ``numpy.ndarray`` method ``argmin``." + ) + see_also = _stat_func_see_also + examples = _min_examples + kwargs = {"min_count": ""} + elif name == "max": + base_doc = _num_doc + desc = ( + "Return the maximum of the values over the requested axis.\n\n" + "If you want the *index* of the maximum, use ``idxmax``. This is " + "the equivalent of the ``numpy.ndarray`` method ``argmax``." + ) + see_also = _stat_func_see_also + examples = _max_examples + kwargs = {"min_count": ""} + + elif name == "sum": + base_doc = _sum_prod_doc + desc = ( + "Return the sum of the values over the requested axis.\n\n" + "This is equivalent to the method ``numpy.sum``." + ) + see_also = _stat_func_see_also + examples = _sum_examples + kwargs = {"min_count": _min_count_stub} + + elif name == "prod": + base_doc = _sum_prod_doc + desc = "Return the product of the values over the requested axis." + see_also = _stat_func_see_also + examples = _prod_examples + kwargs = {"min_count": _min_count_stub} + + elif name == "median": + base_doc = _num_doc + desc = "Return the median of the values over the requested axis." + see_also = "" + examples = """ + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s.median() + 2.0 + + With a DataFrame + + >>> df = pd.DataFrame({'a': [1, 2], 'b': [2, 3]}, index=['tiger', 'zebra']) + >>> df + a b + tiger 1 2 + zebra 2 3 + >>> df.median() + a 1.5 + b 2.5 + dtype: float64 + + Using axis=1 + + >>> df.median(axis=1) + tiger 1.5 + zebra 2.5 + dtype: float64 + + In this case, `numeric_only` should be set to `True` + to avoid getting an error. + + >>> df = pd.DataFrame({'a': [1, 2], 'b': ['T', 'Z']}, + ... index=['tiger', 'zebra']) + >>> df.median(numeric_only=True) + a 1.5 + dtype: float64""" + kwargs = {"min_count": ""} + + elif name == "mean": + base_doc = _num_doc + desc = "Return the mean of the values over the requested axis." + see_also = "" + examples = """ + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s.mean() + 2.0 + + With a DataFrame + + >>> df = pd.DataFrame({'a': [1, 2], 'b': [2, 3]}, index=['tiger', 'zebra']) + >>> df + a b + tiger 1 2 + zebra 2 3 + >>> df.mean() + a 1.5 + b 2.5 + dtype: float64 + + Using axis=1 + + >>> df.mean(axis=1) + tiger 1.5 + zebra 2.5 + dtype: float64 + + In this case, `numeric_only` should be set to `True` to avoid + getting an error. + + >>> df = pd.DataFrame({'a': [1, 2], 'b': ['T', 'Z']}, + ... index=['tiger', 'zebra']) + >>> df.mean(numeric_only=True) + a 1.5 + dtype: float64""" + kwargs = {"min_count": ""} + + elif name == "var": + base_doc = _num_ddof_doc + desc = ( + "Return unbiased variance over requested axis.\n\nNormalized by " + "N-1 by default. This can be changed using the ddof argument." + ) + examples = _var_examples + see_also = "" + kwargs = {"notes": ""} + + elif name == "std": + base_doc = _num_ddof_doc + desc = ( + "Return sample standard deviation over requested axis." + "\n\nNormalized by N-1 by default. This can be changed using the " + "ddof argument." + ) + examples = _std_examples + see_also = "" + kwargs = {"notes": _std_notes} + + elif name == "sem": + base_doc = _num_ddof_doc + desc = ( + "Return unbiased standard error of the mean over requested " + "axis.\n\nNormalized by N-1 by default. This can be changed " + "using the ddof argument" + ) + examples = """ + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s.sem().round(6) + 0.57735 + + With a DataFrame + + >>> df = pd.DataFrame({'a': [1, 2], 'b': [2, 3]}, index=['tiger', 'zebra']) + >>> df + a b + tiger 1 2 + zebra 2 3 + >>> df.sem() + a 0.5 + b 0.5 + dtype: float64 + + Using axis=1 + + >>> df.sem(axis=1) + tiger 0.5 + zebra 0.5 + dtype: float64 + + In this case, `numeric_only` should be set to `True` + to avoid getting an error. + + >>> df = pd.DataFrame({'a': [1, 2], 'b': ['T', 'Z']}, + ... index=['tiger', 'zebra']) + >>> df.sem(numeric_only=True) + a 0.5 + dtype: float64""" + see_also = "" + kwargs = {"notes": ""} + + elif name == "skew": + base_doc = _num_doc + desc = "Return unbiased skew over requested axis.\n\nNormalized by N-1." + see_also = "" + examples = """ + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s.skew() + 0.0 + + With a DataFrame + + >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [2, 3, 4], 'c': [1, 3, 5]}, + ... index=['tiger', 'zebra', 'cow']) + >>> df + a b c + tiger 1 2 1 + zebra 2 3 3 + cow 3 4 5 + >>> df.skew() + a 0.0 + b 0.0 + c 0.0 + dtype: float64 + + Using axis=1 + + >>> df.skew(axis=1) + tiger 1.732051 + zebra -1.732051 + cow 0.000000 + dtype: float64 + + In this case, `numeric_only` should be set to `True` to avoid + getting an error. + + >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': ['T', 'Z', 'X']}, + ... index=['tiger', 'zebra', 'cow']) + >>> df.skew(numeric_only=True) + a 0.0 + dtype: float64""" + kwargs = {"min_count": ""} + elif name == "kurt": + base_doc = _num_doc + desc = ( + "Return unbiased kurtosis over requested axis.\n\n" + "Kurtosis obtained using Fisher's definition of\n" + "kurtosis (kurtosis of normal == 0.0). Normalized " + "by N-1." + ) + see_also = "" + examples = """ + + Examples + -------- + >>> s = pd.Series([1, 2, 2, 3], index=['cat', 'dog', 'dog', 'mouse']) + >>> s + cat 1 + dog 2 + dog 2 + mouse 3 + dtype: int64 + >>> s.kurt() + 1.5 + + With a DataFrame + + >>> df = pd.DataFrame({'a': [1, 2, 2, 3], 'b': [3, 4, 4, 4]}, + ... index=['cat', 'dog', 'dog', 'mouse']) + >>> df + a b + cat 1 3 + dog 2 4 + dog 2 4 + mouse 3 4 + >>> df.kurt() + a 1.5 + b 4.0 + dtype: float64 + + With axis=None + + >>> df.kurt(axis=None).round(6) + -0.988693 + + Using axis=1 + + >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [3, 4], 'd': [1, 2]}, + ... index=['cat', 'dog']) + >>> df.kurt(axis=1) + cat -6.0 + dog -6.0 + dtype: float64""" + kwargs = {"min_count": ""} + + elif name == "cumsum": + base_doc = _cnum_doc + desc = "sum" + see_also = "" + examples = _cumsum_examples + kwargs = {"accum_func_name": "sum"} + + elif name == "cumprod": + base_doc = _cnum_doc + desc = "product" + see_also = "" + examples = _cumprod_examples + kwargs = {"accum_func_name": "prod"} + + elif name == "cummin": + base_doc = _cnum_doc + desc = "minimum" + see_also = "" + examples = _cummin_examples + kwargs = {"accum_func_name": "min"} + + elif name == "cummax": + base_doc = _cnum_doc + desc = "maximum" + see_also = "" + examples = _cummax_examples + kwargs = {"accum_func_name": "max"} + + else: + raise NotImplementedError + + docstr = base_doc.format( + desc=desc, + name=name, + name1=name1, + name2=name2, + axis_descr=axis_descr, + see_also=see_also, + examples=examples, + **kwargs, + ) + return docstr diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexing.py new file mode 100644 index 0000000000000000000000000000000000000000..869e511fc0720c1d98c0baadbf648914d2500fed --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexing.py @@ -0,0 +1,2785 @@ +from __future__ import annotations + +from contextlib import suppress +import sys +from typing import ( + TYPE_CHECKING, + Any, + TypeVar, + cast, + final, +) +import warnings + +import numpy as np + +from pandas._config import ( + using_copy_on_write, + warn_copy_on_write, +) + +from pandas._libs.indexing import NDFrameIndexerBase +from pandas._libs.lib import item_from_zerodim +from pandas.compat import PYPY +from pandas.errors import ( + AbstractMethodError, + ChainedAssignmentError, + IndexingError, + InvalidIndexError, + LossySetitemError, + _chained_assignment_msg, + _chained_assignment_warning_msg, + _check_cacher, +) +from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.cast import ( + can_hold_element, + maybe_promote, +) +from pandas.core.dtypes.common import ( + is_array_like, + is_bool_dtype, + is_hashable, + is_integer, + is_iterator, + is_list_like, + is_numeric_dtype, + is_object_dtype, + is_scalar, + is_sequence, +) +from pandas.core.dtypes.concat import concat_compat +from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, +) +from pandas.core.dtypes.missing import ( + construct_1d_array_from_inferred_fill_value, + infer_fill_value, + is_valid_na_for_dtype, + isna, + na_value_for_dtype, +) + +from pandas.core import algorithms as algos +import pandas.core.common as com +from pandas.core.construction import ( + array as pd_array, + extract_array, +) +from pandas.core.indexers import ( + check_array_indexer, + is_list_like_indexer, + is_scalar_indexer, + length_of_indexer, +) +from pandas.core.indexes.api import ( + Index, + MultiIndex, +) + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Sequence, + ) + + from pandas._typing import ( + Axis, + AxisInt, + Self, + npt, + ) + + from pandas import ( + DataFrame, + Series, + ) + +T = TypeVar("T") +# "null slice" +_NS = slice(None, None) +_one_ellipsis_message = "indexer may only contain one '...' entry" + + +# the public IndexSlicerMaker +class _IndexSlice: + """ + Create an object to more easily perform multi-index slicing. + + See Also + -------- + MultiIndex.remove_unused_levels : New MultiIndex with no unused levels. + + Notes + ----- + See :ref:`Defined Levels ` + for further info on slicing a MultiIndex. + + Examples + -------- + >>> midx = pd.MultiIndex.from_product([['A0','A1'], ['B0','B1','B2','B3']]) + >>> columns = ['foo', 'bar'] + >>> dfmi = pd.DataFrame(np.arange(16).reshape((len(midx), len(columns))), + ... index=midx, columns=columns) + + Using the default slice command: + + >>> dfmi.loc[(slice(None), slice('B0', 'B1')), :] + foo bar + A0 B0 0 1 + B1 2 3 + A1 B0 8 9 + B1 10 11 + + Using the IndexSlice class for a more intuitive command: + + >>> idx = pd.IndexSlice + >>> dfmi.loc[idx[:, 'B0':'B1'], :] + foo bar + A0 B0 0 1 + B1 2 3 + A1 B0 8 9 + B1 10 11 + """ + + def __getitem__(self, arg): + return arg + + +IndexSlice = _IndexSlice() + + +class IndexingMixin: + """ + Mixin for adding .loc/.iloc/.at/.iat to Dataframes and Series. + """ + + @property + def iloc(self) -> _iLocIndexer: + """ + Purely integer-location based indexing for selection by position. + + .. deprecated:: 2.2.0 + + Returning a tuple from a callable is deprecated. + + ``.iloc[]`` is primarily integer position based (from ``0`` to + ``length-1`` of the axis), but may also be used with a boolean + array. + + Allowed inputs are: + + - An integer, e.g. ``5``. + - A list or array of integers, e.g. ``[4, 3, 0]``. + - A slice object with ints, e.g. ``1:7``. + - A boolean array. + - A ``callable`` function with one argument (the calling Series or + DataFrame) and that returns valid output for indexing (one of the above). + This is useful in method chains, when you don't have a reference to the + calling object, but would like to base your selection on + some value. + - A tuple of row and column indexes. The tuple elements consist of one of the + above inputs, e.g. ``(0, 1)``. + + ``.iloc`` will raise ``IndexError`` if a requested indexer is + out-of-bounds, except *slice* indexers which allow out-of-bounds + indexing (this conforms with python/numpy *slice* semantics). + + See more at :ref:`Selection by Position `. + + See Also + -------- + DataFrame.iat : Fast integer location scalar accessor. + DataFrame.loc : Purely label-location based indexer for selection by label. + Series.iloc : Purely integer-location based indexing for + selection by position. + + Examples + -------- + >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4}, + ... {'a': 100, 'b': 200, 'c': 300, 'd': 400}, + ... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000}] + >>> df = pd.DataFrame(mydict) + >>> df + a b c d + 0 1 2 3 4 + 1 100 200 300 400 + 2 1000 2000 3000 4000 + + **Indexing just the rows** + + With a scalar integer. + + >>> type(df.iloc[0]) + + >>> df.iloc[0] + a 1 + b 2 + c 3 + d 4 + Name: 0, dtype: int64 + + With a list of integers. + + >>> df.iloc[[0]] + a b c d + 0 1 2 3 4 + >>> type(df.iloc[[0]]) + + + >>> df.iloc[[0, 1]] + a b c d + 0 1 2 3 4 + 1 100 200 300 400 + + With a `slice` object. + + >>> df.iloc[:3] + a b c d + 0 1 2 3 4 + 1 100 200 300 400 + 2 1000 2000 3000 4000 + + With a boolean mask the same length as the index. + + >>> df.iloc[[True, False, True]] + a b c d + 0 1 2 3 4 + 2 1000 2000 3000 4000 + + With a callable, useful in method chains. The `x` passed + to the ``lambda`` is the DataFrame being sliced. This selects + the rows whose index label even. + + >>> df.iloc[lambda x: x.index % 2 == 0] + a b c d + 0 1 2 3 4 + 2 1000 2000 3000 4000 + + **Indexing both axes** + + You can mix the indexer types for the index and columns. Use ``:`` to + select the entire axis. + + With scalar integers. + + >>> df.iloc[0, 1] + 2 + + With lists of integers. + + >>> df.iloc[[0, 2], [1, 3]] + b d + 0 2 4 + 2 2000 4000 + + With `slice` objects. + + >>> df.iloc[1:3, 0:3] + a b c + 1 100 200 300 + 2 1000 2000 3000 + + With a boolean array whose length matches the columns. + + >>> df.iloc[:, [True, False, True, False]] + a c + 0 1 3 + 1 100 300 + 2 1000 3000 + + With a callable function that expects the Series or DataFrame. + + >>> df.iloc[:, lambda df: [0, 2]] + a c + 0 1 3 + 1 100 300 + 2 1000 3000 + """ + return _iLocIndexer("iloc", self) + + @property + def loc(self) -> _LocIndexer: + """ + Access a group of rows and columns by label(s) or a boolean array. + + ``.loc[]`` is primarily label based, but may also be used with a + boolean array. + + Allowed inputs are: + + - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is + interpreted as a *label* of the index, and **never** as an + integer position along the index). + - A list or array of labels, e.g. ``['a', 'b', 'c']``. + - A slice object with labels, e.g. ``'a':'f'``. + + .. warning:: Note that contrary to usual python slices, **both** the + start and the stop are included + + - A boolean array of the same length as the axis being sliced, + e.g. ``[True, False, True]``. + - An alignable boolean Series. The index of the key will be aligned before + masking. + - An alignable Index. The Index of the returned selection will be the input. + - A ``callable`` function with one argument (the calling Series or + DataFrame) and that returns valid output for indexing (one of the above) + + See more at :ref:`Selection by Label `. + + Raises + ------ + KeyError + If any items are not found. + IndexingError + If an indexed key is passed and its index is unalignable to the frame index. + + See Also + -------- + DataFrame.at : Access a single value for a row/column label pair. + DataFrame.iloc : Access group of rows and columns by integer position(s). + DataFrame.xs : Returns a cross-section (row(s) or column(s)) from the + Series/DataFrame. + Series.loc : Access group of values using labels. + + Examples + -------- + **Getting values** + + >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], + ... index=['cobra', 'viper', 'sidewinder'], + ... columns=['max_speed', 'shield']) + >>> df + max_speed shield + cobra 1 2 + viper 4 5 + sidewinder 7 8 + + Single label. Note this returns the row as a Series. + + >>> df.loc['viper'] + max_speed 4 + shield 5 + Name: viper, dtype: int64 + + List of labels. Note using ``[[]]`` returns a DataFrame. + + >>> df.loc[['viper', 'sidewinder']] + max_speed shield + viper 4 5 + sidewinder 7 8 + + Single label for row and column + + >>> df.loc['cobra', 'shield'] + 2 + + Slice with labels for row and single label for column. As mentioned + above, note that both the start and stop of the slice are included. + + >>> df.loc['cobra':'viper', 'max_speed'] + cobra 1 + viper 4 + Name: max_speed, dtype: int64 + + Boolean list with the same length as the row axis + + >>> df.loc[[False, False, True]] + max_speed shield + sidewinder 7 8 + + Alignable boolean Series: + + >>> df.loc[pd.Series([False, True, False], + ... index=['viper', 'sidewinder', 'cobra'])] + max_speed shield + sidewinder 7 8 + + Index (same behavior as ``df.reindex``) + + >>> df.loc[pd.Index(["cobra", "viper"], name="foo")] + max_speed shield + foo + cobra 1 2 + viper 4 5 + + Conditional that returns a boolean Series + + >>> df.loc[df['shield'] > 6] + max_speed shield + sidewinder 7 8 + + Conditional that returns a boolean Series with column labels specified + + >>> df.loc[df['shield'] > 6, ['max_speed']] + max_speed + sidewinder 7 + + Multiple conditional using ``&`` that returns a boolean Series + + >>> df.loc[(df['max_speed'] > 1) & (df['shield'] < 8)] + max_speed shield + viper 4 5 + + Multiple conditional using ``|`` that returns a boolean Series + + >>> df.loc[(df['max_speed'] > 4) | (df['shield'] < 5)] + max_speed shield + cobra 1 2 + sidewinder 7 8 + + Please ensure that each condition is wrapped in parentheses ``()``. + See the :ref:`user guide` + for more details and explanations of Boolean indexing. + + .. note:: + If you find yourself using 3 or more conditionals in ``.loc[]``, + consider using :ref:`advanced indexing`. + + See below for using ``.loc[]`` on MultiIndex DataFrames. + + Callable that returns a boolean Series + + >>> df.loc[lambda df: df['shield'] == 8] + max_speed shield + sidewinder 7 8 + + **Setting values** + + Set value for all items matching the list of labels + + >>> df.loc[['viper', 'sidewinder'], ['shield']] = 50 + >>> df + max_speed shield + cobra 1 2 + viper 4 50 + sidewinder 7 50 + + Set value for an entire row + + >>> df.loc['cobra'] = 10 + >>> df + max_speed shield + cobra 10 10 + viper 4 50 + sidewinder 7 50 + + Set value for an entire column + + >>> df.loc[:, 'max_speed'] = 30 + >>> df + max_speed shield + cobra 30 10 + viper 30 50 + sidewinder 30 50 + + Set value for rows matching callable condition + + >>> df.loc[df['shield'] > 35] = 0 + >>> df + max_speed shield + cobra 30 10 + viper 0 0 + sidewinder 0 0 + + Add value matching location + + >>> df.loc["viper", "shield"] += 5 + >>> df + max_speed shield + cobra 30 10 + viper 0 5 + sidewinder 0 0 + + Setting using a ``Series`` or a ``DataFrame`` sets the values matching the + index labels, not the index positions. + + >>> shuffled_df = df.loc[["viper", "cobra", "sidewinder"]] + >>> df.loc[:] += shuffled_df + >>> df + max_speed shield + cobra 60 20 + viper 0 10 + sidewinder 0 0 + + **Getting values on a DataFrame with an index that has integer labels** + + Another example using integers for the index + + >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], + ... index=[7, 8, 9], columns=['max_speed', 'shield']) + >>> df + max_speed shield + 7 1 2 + 8 4 5 + 9 7 8 + + Slice with integer labels for rows. As mentioned above, note that both + the start and stop of the slice are included. + + >>> df.loc[7:9] + max_speed shield + 7 1 2 + 8 4 5 + 9 7 8 + + **Getting values with a MultiIndex** + + A number of examples using a DataFrame with a MultiIndex + + >>> tuples = [ + ... ('cobra', 'mark i'), ('cobra', 'mark ii'), + ... ('sidewinder', 'mark i'), ('sidewinder', 'mark ii'), + ... ('viper', 'mark ii'), ('viper', 'mark iii') + ... ] + >>> index = pd.MultiIndex.from_tuples(tuples) + >>> values = [[12, 2], [0, 4], [10, 20], + ... [1, 4], [7, 1], [16, 36]] + >>> df = pd.DataFrame(values, columns=['max_speed', 'shield'], index=index) + >>> df + max_speed shield + cobra mark i 12 2 + mark ii 0 4 + sidewinder mark i 10 20 + mark ii 1 4 + viper mark ii 7 1 + mark iii 16 36 + + Single label. Note this returns a DataFrame with a single index. + + >>> df.loc['cobra'] + max_speed shield + mark i 12 2 + mark ii 0 4 + + Single index tuple. Note this returns a Series. + + >>> df.loc[('cobra', 'mark ii')] + max_speed 0 + shield 4 + Name: (cobra, mark ii), dtype: int64 + + Single label for row and column. Similar to passing in a tuple, this + returns a Series. + + >>> df.loc['cobra', 'mark i'] + max_speed 12 + shield 2 + Name: (cobra, mark i), dtype: int64 + + Single tuple. Note using ``[[]]`` returns a DataFrame. + + >>> df.loc[[('cobra', 'mark ii')]] + max_speed shield + cobra mark ii 0 4 + + Single tuple for the index with a single label for the column + + >>> df.loc[('cobra', 'mark i'), 'shield'] + 2 + + Slice from index tuple to single label + + >>> df.loc[('cobra', 'mark i'):'viper'] + max_speed shield + cobra mark i 12 2 + mark ii 0 4 + sidewinder mark i 10 20 + mark ii 1 4 + viper mark ii 7 1 + mark iii 16 36 + + Slice from index tuple to index tuple + + >>> df.loc[('cobra', 'mark i'):('viper', 'mark ii')] + max_speed shield + cobra mark i 12 2 + mark ii 0 4 + sidewinder mark i 10 20 + mark ii 1 4 + viper mark ii 7 1 + + Please see the :ref:`user guide` + for more details and explanations of advanced indexing. + """ + return _LocIndexer("loc", self) + + @property + def at(self) -> _AtIndexer: + """ + Access a single value for a row/column label pair. + + Similar to ``loc``, in that both provide label-based lookups. Use + ``at`` if you only need to get or set a single value in a DataFrame + or Series. + + Raises + ------ + KeyError + If getting a value and 'label' does not exist in a DataFrame or Series. + + ValueError + If row/column label pair is not a tuple or if any label + from the pair is not a scalar for DataFrame. + If label is list-like (*excluding* NamedTuple) for Series. + + See Also + -------- + DataFrame.at : Access a single value for a row/column pair by label. + DataFrame.iat : Access a single value for a row/column pair by integer + position. + DataFrame.loc : Access a group of rows and columns by label(s). + DataFrame.iloc : Access a group of rows and columns by integer + position(s). + Series.at : Access a single value by label. + Series.iat : Access a single value by integer position. + Series.loc : Access a group of rows by label(s). + Series.iloc : Access a group of rows by integer position(s). + + Notes + ----- + See :ref:`Fast scalar value getting and setting ` + for more details. + + Examples + -------- + >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], + ... index=[4, 5, 6], columns=['A', 'B', 'C']) + >>> df + A B C + 4 0 2 3 + 5 0 4 1 + 6 10 20 30 + + Get value at specified row/column pair + + >>> df.at[4, 'B'] + 2 + + Set value at specified row/column pair + + >>> df.at[4, 'B'] = 10 + >>> df.at[4, 'B'] + 10 + + Get value within a Series + + >>> df.loc[5].at['B'] + 4 + """ + return _AtIndexer("at", self) + + @property + def iat(self) -> _iAtIndexer: + """ + Access a single value for a row/column pair by integer position. + + Similar to ``iloc``, in that both provide integer-based lookups. Use + ``iat`` if you only need to get or set a single value in a DataFrame + or Series. + + Raises + ------ + IndexError + When integer position is out of bounds. + + See Also + -------- + DataFrame.at : Access a single value for a row/column label pair. + DataFrame.loc : Access a group of rows and columns by label(s). + DataFrame.iloc : Access a group of rows and columns by integer position(s). + + Examples + -------- + >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], + ... columns=['A', 'B', 'C']) + >>> df + A B C + 0 0 2 3 + 1 0 4 1 + 2 10 20 30 + + Get value at specified row/column pair + + >>> df.iat[1, 2] + 1 + + Set value at specified row/column pair + + >>> df.iat[1, 2] = 10 + >>> df.iat[1, 2] + 10 + + Get value within a series + + >>> df.loc[0].iat[1] + 2 + """ + return _iAtIndexer("iat", self) + + +class _LocationIndexer(NDFrameIndexerBase): + _valid_types: str + axis: AxisInt | None = None + + # sub-classes need to set _takeable + _takeable: bool + + @final + def __call__(self, axis: Axis | None = None) -> Self: + # we need to return a copy of ourselves + new_self = type(self)(self.name, self.obj) + + if axis is not None: + axis_int_none = self.obj._get_axis_number(axis) + else: + axis_int_none = axis + new_self.axis = axis_int_none + return new_self + + def _get_setitem_indexer(self, key): + """ + Convert a potentially-label-based key into a positional indexer. + """ + if self.name == "loc": + # always holds here bc iloc overrides _get_setitem_indexer + self._ensure_listlike_indexer(key) + + if isinstance(key, tuple): + for x in key: + check_dict_or_set_indexers(x) + + if self.axis is not None: + key = _tupleize_axis_indexer(self.ndim, self.axis, key) + + ax = self.obj._get_axis(0) + + if ( + isinstance(ax, MultiIndex) + and self.name != "iloc" + and is_hashable(key) + and not isinstance(key, slice) + ): + with suppress(KeyError, InvalidIndexError): + # TypeError e.g. passed a bool + return ax.get_loc(key) + + if isinstance(key, tuple): + with suppress(IndexingError): + # suppress "Too many indexers" + return self._convert_tuple(key) + + if isinstance(key, range): + # GH#45479 test_loc_setitem_range_key + key = list(key) + + return self._convert_to_indexer(key, axis=0) + + @final + def _maybe_mask_setitem_value(self, indexer, value): + """ + If we have obj.iloc[mask] = series_or_frame and series_or_frame has the + same length as obj, we treat this as obj.iloc[mask] = series_or_frame[mask], + similar to Series.__setitem__. + + Note this is only for loc, not iloc. + """ + + if ( + isinstance(indexer, tuple) + and len(indexer) == 2 + and isinstance(value, (ABCSeries, ABCDataFrame)) + ): + pi, icols = indexer + ndim = value.ndim + if com.is_bool_indexer(pi) and len(value) == len(pi): + newkey = pi.nonzero()[0] + + if is_scalar_indexer(icols, self.ndim - 1) and ndim == 1: + # e.g. test_loc_setitem_boolean_mask_allfalse + # test_loc_setitem_ndframe_values_alignment + value = self.obj.iloc._align_series(indexer, value) + indexer = (newkey, icols) + + elif ( + isinstance(icols, np.ndarray) + and icols.dtype.kind == "i" + and len(icols) == 1 + ): + if ndim == 1: + # We implicitly broadcast, though numpy does not, see + # github.com/pandas-dev/pandas/pull/45501#discussion_r789071825 + # test_loc_setitem_ndframe_values_alignment + value = self.obj.iloc._align_series(indexer, value) + indexer = (newkey, icols) + + elif ndim == 2 and value.shape[1] == 1: + # test_loc_setitem_ndframe_values_alignment + value = self.obj.iloc._align_frame(indexer, value) + indexer = (newkey, icols) + elif com.is_bool_indexer(indexer): + indexer = indexer.nonzero()[0] + + return indexer, value + + @final + def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None: + """ + Ensure that a list-like of column labels are all present by adding them if + they do not already exist. + + Parameters + ---------- + key : list-like of column labels + Target labels. + axis : key axis if known + """ + column_axis = 1 + + # column only exists in 2-dimensional DataFrame + if self.ndim != 2: + return + + if isinstance(key, tuple) and len(key) > 1: + # key may be a tuple if we are .loc + # if length of key is > 1 set key to column part + key = key[column_axis] + axis = column_axis + + if ( + axis == column_axis + and not isinstance(self.obj.columns, MultiIndex) + and is_list_like_indexer(key) + and not com.is_bool_indexer(key) + and all(is_hashable(k) for k in key) + ): + # GH#38148 + keys = self.obj.columns.union(key, sort=False) + diff = Index(key).difference(self.obj.columns, sort=False) + + if len(diff): + # e.g. if we are doing df.loc[:, ["A", "B"]] = 7 and "B" + # is a new column, add the new columns with dtype=np.void + # so that later when we go through setitem_single_column + # we will use isetitem. Without this, the reindex_axis + # below would create float64 columns in this example, which + # would successfully hold 7, so we would end up with the wrong + # dtype. + indexer = np.arange(len(keys), dtype=np.intp) + indexer[len(self.obj.columns) :] = -1 + new_mgr = self.obj._mgr.reindex_indexer( + keys, indexer=indexer, axis=0, only_slice=True, use_na_proxy=True + ) + self.obj._mgr = new_mgr + return + + self.obj._mgr = self.obj._mgr.reindex_axis(keys, axis=0, only_slice=True) + + @final + def __setitem__(self, key, value) -> None: + if not PYPY and using_copy_on_write(): + if sys.getrefcount(self.obj) <= 2: + warnings.warn( + _chained_assignment_msg, ChainedAssignmentError, stacklevel=2 + ) + elif not PYPY and not using_copy_on_write(): + ctr = sys.getrefcount(self.obj) + ref_count = 2 + if not warn_copy_on_write() and _check_cacher(self.obj): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_msg, FutureWarning, stacklevel=2 + ) + + check_dict_or_set_indexers(key) + if isinstance(key, tuple): + key = tuple(list(x) if is_iterator(x) else x for x in key) + key = tuple(com.apply_if_callable(x, self.obj) for x in key) + else: + maybe_callable = com.apply_if_callable(key, self.obj) + key = self._check_deprecated_callable_usage(key, maybe_callable) + indexer = self._get_setitem_indexer(key) + self._has_valid_setitem_indexer(key) + + iloc = self if self.name == "iloc" else self.obj.iloc + iloc._setitem_with_indexer(indexer, value, self.name) + + def _validate_key(self, key, axis: AxisInt): + """ + Ensure that key is valid for current indexer. + + Parameters + ---------- + key : scalar, slice or list-like + Key requested. + axis : int + Dimension on which the indexing is being made. + + Raises + ------ + TypeError + If the key (or some element of it) has wrong type. + IndexError + If the key (or some element of it) is out of bounds. + KeyError + If the key was not found. + """ + raise AbstractMethodError(self) + + @final + def _expand_ellipsis(self, tup: tuple) -> tuple: + """ + If a tuple key includes an Ellipsis, replace it with an appropriate + number of null slices. + """ + if any(x is Ellipsis for x in tup): + if tup.count(Ellipsis) > 1: + raise IndexingError(_one_ellipsis_message) + + if len(tup) == self.ndim: + # It is unambiguous what axis this Ellipsis is indexing, + # treat as a single null slice. + i = tup.index(Ellipsis) + # FIXME: this assumes only one Ellipsis + new_key = tup[:i] + (_NS,) + tup[i + 1 :] + return new_key + + # TODO: other cases? only one test gets here, and that is covered + # by _validate_key_length + return tup + + @final + def _validate_tuple_indexer(self, key: tuple) -> tuple: + """ + Check the key for valid keys across my indexer. + """ + key = self._validate_key_length(key) + key = self._expand_ellipsis(key) + for i, k in enumerate(key): + try: + self._validate_key(k, i) + except ValueError as err: + raise ValueError( + "Location based indexing can only have " + f"[{self._valid_types}] types" + ) from err + return key + + @final + def _is_nested_tuple_indexer(self, tup: tuple) -> bool: + """ + Returns + ------- + bool + """ + if any(isinstance(ax, MultiIndex) for ax in self.obj.axes): + return any(is_nested_tuple(tup, ax) for ax in self.obj.axes) + return False + + @final + def _convert_tuple(self, key: tuple) -> tuple: + # Note: we assume _tupleize_axis_indexer has been called, if necessary. + self._validate_key_length(key) + keyidx = [self._convert_to_indexer(k, axis=i) for i, k in enumerate(key)] + return tuple(keyidx) + + @final + def _validate_key_length(self, key: tuple) -> tuple: + if len(key) > self.ndim: + if key[0] is Ellipsis: + # e.g. Series.iloc[..., 3] reduces to just Series.iloc[3] + key = key[1:] + if Ellipsis in key: + raise IndexingError(_one_ellipsis_message) + return self._validate_key_length(key) + raise IndexingError("Too many indexers") + return key + + @final + def _getitem_tuple_same_dim(self, tup: tuple): + """ + Index with indexers that should return an object of the same dimension + as self.obj. + + This is only called after a failed call to _getitem_lowerdim. + """ + retval = self.obj + # Selecting columns before rows is significantly faster + start_val = (self.ndim - len(tup)) + 1 + for i, key in enumerate(reversed(tup)): + i = self.ndim - i - start_val + if com.is_null_slice(key): + continue + + retval = getattr(retval, self.name)._getitem_axis(key, axis=i) + # We should never have retval.ndim < self.ndim, as that should + # be handled by the _getitem_lowerdim call above. + assert retval.ndim == self.ndim + + if retval is self.obj: + # if all axes were a null slice (`df.loc[:, :]`), ensure we still + # return a new object (https://github.com/pandas-dev/pandas/pull/49469) + retval = retval.copy(deep=False) + + return retval + + @final + def _getitem_lowerdim(self, tup: tuple): + # we can directly get the axis result since the axis is specified + if self.axis is not None: + axis = self.obj._get_axis_number(self.axis) + return self._getitem_axis(tup, axis=axis) + + # we may have a nested tuples indexer here + if self._is_nested_tuple_indexer(tup): + return self._getitem_nested_tuple(tup) + + # we maybe be using a tuple to represent multiple dimensions here + ax0 = self.obj._get_axis(0) + # ...but iloc should handle the tuple as simple integer-location + # instead of checking it as multiindex representation (GH 13797) + if ( + isinstance(ax0, MultiIndex) + and self.name != "iloc" + and not any(isinstance(x, slice) for x in tup) + ): + # Note: in all extant test cases, replacing the slice condition with + # `all(is_hashable(x) or com.is_null_slice(x) for x in tup)` + # is equivalent. + # (see the other place where we call _handle_lowerdim_multi_index_axis0) + with suppress(IndexingError): + return cast(_LocIndexer, self)._handle_lowerdim_multi_index_axis0(tup) + + tup = self._validate_key_length(tup) + + for i, key in enumerate(tup): + if is_label_like(key): + # We don't need to check for tuples here because those are + # caught by the _is_nested_tuple_indexer check above. + section = self._getitem_axis(key, axis=i) + + # We should never have a scalar section here, because + # _getitem_lowerdim is only called after a check for + # is_scalar_access, which that would be. + if section.ndim == self.ndim: + # we're in the middle of slicing through a MultiIndex + # revise the key wrt to `section` by inserting an _NS + new_key = tup[:i] + (_NS,) + tup[i + 1 :] + + else: + # Note: the section.ndim == self.ndim check above + # rules out having DataFrame here, so we dont need to worry + # about transposing. + new_key = tup[:i] + tup[i + 1 :] + + if len(new_key) == 1: + new_key = new_key[0] + + # Slices should return views, but calling iloc/loc with a null + # slice returns a new object. + if com.is_null_slice(new_key): + return section + # This is an elided recursive call to iloc/loc + return getattr(section, self.name)[new_key] + + raise IndexingError("not applicable") + + @final + def _getitem_nested_tuple(self, tup: tuple): + # we have a nested tuple so have at least 1 multi-index level + # we should be able to match up the dimensionality here + + def _contains_slice(x: object) -> bool: + # Check if object is a slice or a tuple containing a slice + if isinstance(x, tuple): + return any(isinstance(v, slice) for v in x) + elif isinstance(x, slice): + return True + return False + + for key in tup: + check_dict_or_set_indexers(key) + + # we have too many indexers for our dim, but have at least 1 + # multi-index dimension, try to see if we have something like + # a tuple passed to a series with a multi-index + if len(tup) > self.ndim: + if self.name != "loc": + # This should never be reached, but let's be explicit about it + raise ValueError("Too many indices") # pragma: no cover + if all( + (is_hashable(x) and not _contains_slice(x)) or com.is_null_slice(x) + for x in tup + ): + # GH#10521 Series should reduce MultiIndex dimensions instead of + # DataFrame, IndexingError is not raised when slice(None,None,None) + # with one row. + with suppress(IndexingError): + return cast(_LocIndexer, self)._handle_lowerdim_multi_index_axis0( + tup + ) + elif isinstance(self.obj, ABCSeries) and any( + isinstance(k, tuple) for k in tup + ): + # GH#35349 Raise if tuple in tuple for series + # Do this after the all-hashable-or-null-slice check so that + # we are only getting non-hashable tuples, in particular ones + # that themselves contain a slice entry + # See test_loc_series_getitem_too_many_dimensions + raise IndexingError("Too many indexers") + + # this is a series with a multi-index specified a tuple of + # selectors + axis = self.axis or 0 + return self._getitem_axis(tup, axis=axis) + + # handle the multi-axis by taking sections and reducing + # this is iterative + obj = self.obj + # GH#41369 Loop in reverse order ensures indexing along columns before rows + # which selects only necessary blocks which avoids dtype conversion if possible + axis = len(tup) - 1 + for key in tup[::-1]: + if com.is_null_slice(key): + axis -= 1 + continue + + obj = getattr(obj, self.name)._getitem_axis(key, axis=axis) + axis -= 1 + + # if we have a scalar, we are done + if is_scalar(obj) or not hasattr(obj, "ndim"): + break + + return obj + + def _convert_to_indexer(self, key, axis: AxisInt): + raise AbstractMethodError(self) + + def _check_deprecated_callable_usage(self, key: Any, maybe_callable: T) -> T: + # GH53533 + if self.name == "iloc" and callable(key) and isinstance(maybe_callable, tuple): + warnings.warn( + "Returning a tuple from a callable with iloc " + "is deprecated and will be removed in a future version", + FutureWarning, + stacklevel=find_stack_level(), + ) + return maybe_callable + + @final + def __getitem__(self, key): + check_dict_or_set_indexers(key) + if type(key) is tuple: + key = tuple(list(x) if is_iterator(x) else x for x in key) + key = tuple(com.apply_if_callable(x, self.obj) for x in key) + if self._is_scalar_access(key): + return self.obj._get_value(*key, takeable=self._takeable) + return self._getitem_tuple(key) + else: + # we by definition only have the 0th axis + axis = self.axis or 0 + + maybe_callable = com.apply_if_callable(key, self.obj) + maybe_callable = self._check_deprecated_callable_usage(key, maybe_callable) + return self._getitem_axis(maybe_callable, axis=axis) + + def _is_scalar_access(self, key: tuple): + raise NotImplementedError() + + def _getitem_tuple(self, tup: tuple): + raise AbstractMethodError(self) + + def _getitem_axis(self, key, axis: AxisInt): + raise NotImplementedError() + + def _has_valid_setitem_indexer(self, indexer) -> bool: + raise AbstractMethodError(self) + + @final + def _getbool_axis(self, key, axis: AxisInt): + # caller is responsible for ensuring non-None axis + labels = self.obj._get_axis(axis) + key = check_bool_indexer(labels, key) + inds = key.nonzero()[0] + return self.obj._take_with_is_copy(inds, axis=axis) + + +@doc(IndexingMixin.loc) +class _LocIndexer(_LocationIndexer): + _takeable: bool = False + _valid_types = ( + "labels (MUST BE IN THE INDEX), slices of labels (BOTH " + "endpoints included! Can be slices of integers if the " + "index is integers), listlike of labels, boolean" + ) + + # ------------------------------------------------------------------- + # Key Checks + + @doc(_LocationIndexer._validate_key) + def _validate_key(self, key, axis: Axis): + # valid for a collection of labels (we check their presence later) + # slice of labels (where start-end in labels) + # slice of integers (only if in the labels) + # boolean not in slice and with boolean index + ax = self.obj._get_axis(axis) + if isinstance(key, bool) and not ( + is_bool_dtype(ax.dtype) + or ax.dtype.name == "boolean" + or isinstance(ax, MultiIndex) + and is_bool_dtype(ax.get_level_values(0).dtype) + ): + raise KeyError( + f"{key}: boolean label can not be used without a boolean index" + ) + + if isinstance(key, slice) and ( + isinstance(key.start, bool) or isinstance(key.stop, bool) + ): + raise TypeError(f"{key}: boolean values can not be used in a slice") + + def _has_valid_setitem_indexer(self, indexer) -> bool: + return True + + def _is_scalar_access(self, key: tuple) -> bool: + """ + Returns + ------- + bool + """ + # this is a shortcut accessor to both .loc and .iloc + # that provide the equivalent access of .at and .iat + # a) avoid getting things via sections and (to minimize dtype changes) + # b) provide a performant path + if len(key) != self.ndim: + return False + + for i, k in enumerate(key): + if not is_scalar(k): + return False + + ax = self.obj.axes[i] + if isinstance(ax, MultiIndex): + return False + + if isinstance(k, str) and ax._supports_partial_string_indexing: + # partial string indexing, df.loc['2000', 'A'] + # should not be considered scalar + return False + + if not ax._index_as_unique: + return False + + return True + + # ------------------------------------------------------------------- + # MultiIndex Handling + + def _multi_take_opportunity(self, tup: tuple) -> bool: + """ + Check whether there is the possibility to use ``_multi_take``. + + Currently the limit is that all axes being indexed, must be indexed with + list-likes. + + Parameters + ---------- + tup : tuple + Tuple of indexers, one per axis. + + Returns + ------- + bool + Whether the current indexing, + can be passed through `_multi_take`. + """ + if not all(is_list_like_indexer(x) for x in tup): + return False + + # just too complicated + return not any(com.is_bool_indexer(x) for x in tup) + + def _multi_take(self, tup: tuple): + """ + Create the indexers for the passed tuple of keys, and + executes the take operation. This allows the take operation to be + executed all at once, rather than once for each dimension. + Improving efficiency. + + Parameters + ---------- + tup : tuple + Tuple of indexers, one per axis. + + Returns + ------- + values: same type as the object being indexed + """ + # GH 836 + d = { + axis: self._get_listlike_indexer(key, axis) + for (key, axis) in zip(tup, self.obj._AXIS_ORDERS) + } + return self.obj._reindex_with_indexers(d, copy=True, allow_dups=True) + + # ------------------------------------------------------------------- + + def _getitem_iterable(self, key, axis: AxisInt): + """ + Index current object with an iterable collection of keys. + + Parameters + ---------- + key : iterable + Targeted labels. + axis : int + Dimension on which the indexing is being made. + + Raises + ------ + KeyError + If no key was found. Will change in the future to raise if not all + keys were found. + + Returns + ------- + scalar, DataFrame, or Series: indexed value(s). + """ + # we assume that not com.is_bool_indexer(key), as that is + # handled before we get here. + self._validate_key(key, axis) + + # A collection of keys + keyarr, indexer = self._get_listlike_indexer(key, axis) + return self.obj._reindex_with_indexers( + {axis: [keyarr, indexer]}, copy=True, allow_dups=True + ) + + def _getitem_tuple(self, tup: tuple): + with suppress(IndexingError): + tup = self._expand_ellipsis(tup) + return self._getitem_lowerdim(tup) + + # no multi-index, so validate all of the indexers + tup = self._validate_tuple_indexer(tup) + + # ugly hack for GH #836 + if self._multi_take_opportunity(tup): + return self._multi_take(tup) + + return self._getitem_tuple_same_dim(tup) + + def _get_label(self, label, axis: AxisInt): + # GH#5567 this will fail if the label is not present in the axis. + return self.obj.xs(label, axis=axis) + + def _handle_lowerdim_multi_index_axis0(self, tup: tuple): + # we have an axis0 multi-index, handle or raise + axis = self.axis or 0 + try: + # fast path for series or for tup devoid of slices + return self._get_label(tup, axis=axis) + + except KeyError as ek: + # raise KeyError if number of indexers match + # else IndexingError will be raised + if self.ndim < len(tup) <= self.obj.index.nlevels: + raise ek + raise IndexingError("No label returned") from ek + + def _getitem_axis(self, key, axis: AxisInt): + key = item_from_zerodim(key) + if is_iterator(key): + key = list(key) + if key is Ellipsis: + key = slice(None) + + labels = self.obj._get_axis(axis) + + if isinstance(key, tuple) and isinstance(labels, MultiIndex): + key = tuple(key) + + if isinstance(key, slice): + self._validate_key(key, axis) + return self._get_slice_axis(key, axis=axis) + elif com.is_bool_indexer(key): + return self._getbool_axis(key, axis=axis) + elif is_list_like_indexer(key): + # an iterable multi-selection + if not (isinstance(key, tuple) and isinstance(labels, MultiIndex)): + if hasattr(key, "ndim") and key.ndim > 1: + raise ValueError("Cannot index with multidimensional key") + + return self._getitem_iterable(key, axis=axis) + + # nested tuple slicing + if is_nested_tuple(key, labels): + locs = labels.get_locs(key) + indexer: list[slice | npt.NDArray[np.intp]] = [slice(None)] * self.ndim + indexer[axis] = locs + return self.obj.iloc[tuple(indexer)] + + # fall thru to straight lookup + self._validate_key(key, axis) + return self._get_label(key, axis=axis) + + def _get_slice_axis(self, slice_obj: slice, axis: AxisInt): + """ + This is pretty simple as we just have to deal with labels. + """ + # caller is responsible for ensuring non-None axis + obj = self.obj + if not need_slice(slice_obj): + return obj.copy(deep=False) + + labels = obj._get_axis(axis) + indexer = labels.slice_indexer(slice_obj.start, slice_obj.stop, slice_obj.step) + + if isinstance(indexer, slice): + return self.obj._slice(indexer, axis=axis) + else: + # DatetimeIndex overrides Index.slice_indexer and may + # return a DatetimeIndex instead of a slice object. + return self.obj.take(indexer, axis=axis) + + def _convert_to_indexer(self, key, axis: AxisInt): + """ + Convert indexing key into something we can use to do actual fancy + indexing on a ndarray. + + Examples + ix[:5] -> slice(0, 5) + ix[[1,2,3]] -> [1,2,3] + ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz) + + Going by Zen of Python? + 'In the face of ambiguity, refuse the temptation to guess.' + raise AmbiguousIndexError with integer labels? + - No, prefer label-based indexing + """ + labels = self.obj._get_axis(axis) + + if isinstance(key, slice): + return labels._convert_slice_indexer(key, kind="loc") + + if ( + isinstance(key, tuple) + and not isinstance(labels, MultiIndex) + and self.ndim < 2 + and len(key) > 1 + ): + raise IndexingError("Too many indexers") + + # Slices are not valid keys passed in by the user, + # even though they are hashable in Python 3.12 + contains_slice = False + if isinstance(key, tuple): + contains_slice = any(isinstance(v, slice) for v in key) + + if is_scalar(key) or ( + isinstance(labels, MultiIndex) and is_hashable(key) and not contains_slice + ): + # Otherwise get_loc will raise InvalidIndexError + + # if we are a label return me + try: + return labels.get_loc(key) + except LookupError: + if isinstance(key, tuple) and isinstance(labels, MultiIndex): + if len(key) == labels.nlevels: + return {"key": key} + raise + except InvalidIndexError: + # GH35015, using datetime as column indices raises exception + if not isinstance(labels, MultiIndex): + raise + except ValueError: + if not is_integer(key): + raise + return {"key": key} + + if is_nested_tuple(key, labels): + if self.ndim == 1 and any(isinstance(k, tuple) for k in key): + # GH#35349 Raise if tuple in tuple for series + raise IndexingError("Too many indexers") + return labels.get_locs(key) + + elif is_list_like_indexer(key): + if is_iterator(key): + key = list(key) + + if com.is_bool_indexer(key): + key = check_bool_indexer(labels, key) + return key + else: + return self._get_listlike_indexer(key, axis)[1] + else: + try: + return labels.get_loc(key) + except LookupError: + # allow a not found key only if we are a setter + if not is_list_like_indexer(key): + return {"key": key} + raise + + def _get_listlike_indexer(self, key, axis: AxisInt): + """ + Transform a list-like of keys into a new index and an indexer. + + Parameters + ---------- + key : list-like + Targeted labels. + axis: int + Dimension on which the indexing is being made. + + Raises + ------ + KeyError + If at least one key was requested but none was found. + + Returns + ------- + keyarr: Index + New index (coinciding with 'key' if the axis is unique). + values : array-like + Indexer for the return object, -1 denotes keys not found. + """ + ax = self.obj._get_axis(axis) + axis_name = self.obj._get_axis_name(axis) + + keyarr, indexer = ax._get_indexer_strict(key, axis_name) + + return keyarr, indexer + + +@doc(IndexingMixin.iloc) +class _iLocIndexer(_LocationIndexer): + _valid_types = ( + "integer, integer slice (START point is INCLUDED, END " + "point is EXCLUDED), listlike of integers, boolean array" + ) + _takeable = True + + # ------------------------------------------------------------------- + # Key Checks + + def _validate_key(self, key, axis: AxisInt): + if com.is_bool_indexer(key): + if hasattr(key, "index") and isinstance(key.index, Index): + if key.index.inferred_type == "integer": + raise NotImplementedError( + "iLocation based boolean " + "indexing on an integer type " + "is not available" + ) + raise ValueError( + "iLocation based boolean indexing cannot use " + "an indexable as a mask" + ) + return + + if isinstance(key, slice): + return + elif is_integer(key): + self._validate_integer(key, axis) + elif isinstance(key, tuple): + # a tuple should already have been caught by this point + # so don't treat a tuple as a valid indexer + raise IndexingError("Too many indexers") + elif is_list_like_indexer(key): + if isinstance(key, ABCSeries): + arr = key._values + elif is_array_like(key): + arr = key + else: + arr = np.array(key) + len_axis = len(self.obj._get_axis(axis)) + + # check that the key has a numeric dtype + if not is_numeric_dtype(arr.dtype): + raise IndexError(f".iloc requires numeric indexers, got {arr}") + + # check that the key does not exceed the maximum size of the index + if len(arr) and (arr.max() >= len_axis or arr.min() < -len_axis): + raise IndexError("positional indexers are out-of-bounds") + else: + raise ValueError(f"Can only index by location with a [{self._valid_types}]") + + def _has_valid_setitem_indexer(self, indexer) -> bool: + """ + Validate that a positional indexer cannot enlarge its target + will raise if needed, does not modify the indexer externally. + + Returns + ------- + bool + """ + if isinstance(indexer, dict): + raise IndexError("iloc cannot enlarge its target object") + + if isinstance(indexer, ABCDataFrame): + raise TypeError( + "DataFrame indexer for .iloc is not supported. " + "Consider using .loc with a DataFrame indexer for automatic alignment.", + ) + + if not isinstance(indexer, tuple): + indexer = _tuplify(self.ndim, indexer) + + for ax, i in zip(self.obj.axes, indexer): + if isinstance(i, slice): + # should check the stop slice? + pass + elif is_list_like_indexer(i): + # should check the elements? + pass + elif is_integer(i): + if i >= len(ax): + raise IndexError("iloc cannot enlarge its target object") + elif isinstance(i, dict): + raise IndexError("iloc cannot enlarge its target object") + + return True + + def _is_scalar_access(self, key: tuple) -> bool: + """ + Returns + ------- + bool + """ + # this is a shortcut accessor to both .loc and .iloc + # that provide the equivalent access of .at and .iat + # a) avoid getting things via sections and (to minimize dtype changes) + # b) provide a performant path + if len(key) != self.ndim: + return False + + return all(is_integer(k) for k in key) + + def _validate_integer(self, key: int | np.integer, axis: AxisInt) -> None: + """ + Check that 'key' is a valid position in the desired axis. + + Parameters + ---------- + key : int + Requested position. + axis : int + Desired axis. + + Raises + ------ + IndexError + If 'key' is not a valid position in axis 'axis'. + """ + len_axis = len(self.obj._get_axis(axis)) + if key >= len_axis or key < -len_axis: + raise IndexError("single positional indexer is out-of-bounds") + + # ------------------------------------------------------------------- + + def _getitem_tuple(self, tup: tuple): + tup = self._validate_tuple_indexer(tup) + with suppress(IndexingError): + return self._getitem_lowerdim(tup) + + return self._getitem_tuple_same_dim(tup) + + def _get_list_axis(self, key, axis: AxisInt): + """ + Return Series values by list or array of integers. + + Parameters + ---------- + key : list-like positional indexer + axis : int + + Returns + ------- + Series object + + Notes + ----- + `axis` can only be zero. + """ + try: + return self.obj._take_with_is_copy(key, axis=axis) + except IndexError as err: + # re-raise with different error message, e.g. test_getitem_ndarray_3d + raise IndexError("positional indexers are out-of-bounds") from err + + def _getitem_axis(self, key, axis: AxisInt): + if key is Ellipsis: + key = slice(None) + elif isinstance(key, ABCDataFrame): + raise IndexError( + "DataFrame indexer is not allowed for .iloc\n" + "Consider using .loc for automatic alignment." + ) + + if isinstance(key, slice): + return self._get_slice_axis(key, axis=axis) + + if is_iterator(key): + key = list(key) + + if isinstance(key, list): + key = np.asarray(key) + + if com.is_bool_indexer(key): + self._validate_key(key, axis) + return self._getbool_axis(key, axis=axis) + + # a list of integers + elif is_list_like_indexer(key): + return self._get_list_axis(key, axis=axis) + + # a single integer + else: + key = item_from_zerodim(key) + if not is_integer(key): + raise TypeError("Cannot index by location index with a non-integer key") + + # validate the location + self._validate_integer(key, axis) + + return self.obj._ixs(key, axis=axis) + + def _get_slice_axis(self, slice_obj: slice, axis: AxisInt): + # caller is responsible for ensuring non-None axis + obj = self.obj + + if not need_slice(slice_obj): + return obj.copy(deep=False) + + labels = obj._get_axis(axis) + labels._validate_positional_slice(slice_obj) + return self.obj._slice(slice_obj, axis=axis) + + def _convert_to_indexer(self, key, axis: AxisInt): + """ + Much simpler as we only have to deal with our valid types. + """ + return key + + def _get_setitem_indexer(self, key): + # GH#32257 Fall through to let numpy do validation + if is_iterator(key): + key = list(key) + + if self.axis is not None: + key = _tupleize_axis_indexer(self.ndim, self.axis, key) + + return key + + # ------------------------------------------------------------------- + + def _setitem_with_indexer(self, indexer, value, name: str = "iloc"): + """ + _setitem_with_indexer is for setting values on a Series/DataFrame + using positional indexers. + + If the relevant keys are not present, the Series/DataFrame may be + expanded. + + This method is currently broken when dealing with non-unique Indexes, + since it goes from positional indexers back to labels when calling + BlockManager methods, see GH#12991, GH#22046, GH#15686. + """ + info_axis = self.obj._info_axis_number + + # maybe partial set + take_split_path = not self.obj._mgr.is_single_block + + if not take_split_path and isinstance(value, ABCDataFrame): + # Avoid cast of values + take_split_path = not value._mgr.is_single_block + + # if there is only one block/type, still have to take split path + # unless the block is one-dimensional or it can hold the value + if not take_split_path and len(self.obj._mgr.arrays) and self.ndim > 1: + # in case of dict, keys are indices + val = list(value.values()) if isinstance(value, dict) else value + arr = self.obj._mgr.arrays[0] + take_split_path = not can_hold_element( + arr, extract_array(val, extract_numpy=True) + ) + + # if we have any multi-indexes that have non-trivial slices + # (not null slices) then we must take the split path, xref + # GH 10360, GH 27841 + if isinstance(indexer, tuple) and len(indexer) == len(self.obj.axes): + for i, ax in zip(indexer, self.obj.axes): + if isinstance(ax, MultiIndex) and not ( + is_integer(i) or com.is_null_slice(i) + ): + take_split_path = True + break + + if isinstance(indexer, tuple): + nindexer = [] + for i, idx in enumerate(indexer): + if isinstance(idx, dict): + # reindex the axis to the new value + # and set inplace + key, _ = convert_missing_indexer(idx) + + # if this is the items axes, then take the main missing + # path first + # this correctly sets the dtype and avoids cache issues + # essentially this separates out the block that is needed + # to possibly be modified + if self.ndim > 1 and i == info_axis: + # add the new item, and set the value + # must have all defined axes if we have a scalar + # or a list-like on the non-info axes if we have a + # list-like + if not len(self.obj): + if not is_list_like_indexer(value): + raise ValueError( + "cannot set a frame with no " + "defined index and a scalar" + ) + self.obj[key] = value + return + + # add a new item with the dtype setup + if com.is_null_slice(indexer[0]): + # We are setting an entire column + self.obj[key] = value + return + elif is_array_like(value): + # GH#42099 + arr = extract_array(value, extract_numpy=True) + taker = -1 * np.ones(len(self.obj), dtype=np.intp) + empty_value = algos.take_nd(arr, taker) + if not isinstance(value, ABCSeries): + # if not Series (in which case we need to align), + # we can short-circuit + if ( + isinstance(arr, np.ndarray) + and arr.ndim == 1 + and len(arr) == 1 + ): + # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 + arr = arr[0, ...] + empty_value[indexer[0]] = arr + self.obj[key] = empty_value + return + + self.obj[key] = empty_value + elif not is_list_like(value): + self.obj[key] = construct_1d_array_from_inferred_fill_value( + value, len(self.obj) + ) + else: + # FIXME: GH#42099#issuecomment-864326014 + self.obj[key] = infer_fill_value(value) + + new_indexer = convert_from_missing_indexer_tuple( + indexer, self.obj.axes + ) + self._setitem_with_indexer(new_indexer, value, name) + + return + + # reindex the axis + # make sure to clear the cache because we are + # just replacing the block manager here + # so the object is the same + index = self.obj._get_axis(i) + with warnings.catch_warnings(): + # TODO: re-issue this with setitem-specific message? + warnings.filterwarnings( + "ignore", + "The behavior of Index.insert with object-dtype " + "is deprecated", + category=FutureWarning, + ) + labels = index.insert(len(index), key) + + # We are expanding the Series/DataFrame values to match + # the length of thenew index `labels`. GH#40096 ensure + # this is valid even if the index has duplicates. + taker = np.arange(len(index) + 1, dtype=np.intp) + taker[-1] = -1 + reindexers = {i: (labels, taker)} + new_obj = self.obj._reindex_with_indexers( + reindexers, allow_dups=True + ) + self.obj._mgr = new_obj._mgr + self.obj._maybe_update_cacher(clear=True) + self.obj._is_copy = None + + nindexer.append(labels.get_loc(key)) + + else: + nindexer.append(idx) + + indexer = tuple(nindexer) + else: + indexer, missing = convert_missing_indexer(indexer) + + if missing: + self._setitem_with_indexer_missing(indexer, value) + return + + if name == "loc": + # must come after setting of missing + indexer, value = self._maybe_mask_setitem_value(indexer, value) + + # align and set the values + if take_split_path: + # We have to operate column-wise + self._setitem_with_indexer_split_path(indexer, value, name) + else: + self._setitem_single_block(indexer, value, name) + + def _setitem_with_indexer_split_path(self, indexer, value, name: str): + """ + Setitem column-wise. + """ + # Above we only set take_split_path to True for 2D cases + assert self.ndim == 2 + + if not isinstance(indexer, tuple): + indexer = _tuplify(self.ndim, indexer) + if len(indexer) > self.ndim: + raise IndexError("too many indices for array") + if isinstance(indexer[0], np.ndarray) and indexer[0].ndim > 2: + raise ValueError(r"Cannot set values with ndim > 2") + + if (isinstance(value, ABCSeries) and name != "iloc") or isinstance(value, dict): + from pandas import Series + + value = self._align_series(indexer, Series(value)) + + # Ensure we have something we can iterate over + info_axis = indexer[1] + ilocs = self._ensure_iterable_column_indexer(info_axis) + + pi = indexer[0] + lplane_indexer = length_of_indexer(pi, self.obj.index) + # lplane_indexer gives the expected length of obj[indexer[0]] + + # we need an iterable, with a ndim of at least 1 + # eg. don't pass through np.array(0) + if is_list_like_indexer(value) and getattr(value, "ndim", 1) > 0: + if isinstance(value, ABCDataFrame): + self._setitem_with_indexer_frame_value(indexer, value, name) + + elif np.ndim(value) == 2: + # TODO: avoid np.ndim call in case it isn't an ndarray, since + # that will construct an ndarray, which will be wasteful + self._setitem_with_indexer_2d_value(indexer, value) + + elif len(ilocs) == 1 and lplane_indexer == len(value) and not is_scalar(pi): + # We are setting multiple rows in a single column. + self._setitem_single_column(ilocs[0], value, pi) + + elif len(ilocs) == 1 and 0 != lplane_indexer != len(value): + # We are trying to set N values into M entries of a single + # column, which is invalid for N != M + # Exclude zero-len for e.g. boolean masking that is all-false + + if len(value) == 1 and not is_integer(info_axis): + # This is a case like df.iloc[:3, [1]] = [0] + # where we treat as df.iloc[:3, 1] = 0 + return self._setitem_with_indexer((pi, info_axis[0]), value[0]) + + raise ValueError( + "Must have equal len keys and value " + "when setting with an iterable" + ) + + elif lplane_indexer == 0 and len(value) == len(self.obj.index): + # We get here in one case via .loc with a all-False mask + pass + + elif self._is_scalar_access(indexer) and is_object_dtype( + self.obj.dtypes._values[ilocs[0]] + ): + # We are setting nested data, only possible for object dtype data + self._setitem_single_column(indexer[1], value, pi) + + elif len(ilocs) == len(value): + # We are setting multiple columns in a single row. + for loc, v in zip(ilocs, value): + self._setitem_single_column(loc, v, pi) + + elif len(ilocs) == 1 and com.is_null_slice(pi) and len(self.obj) == 0: + # This is a setitem-with-expansion, see + # test_loc_setitem_empty_append_expands_rows_mixed_dtype + # e.g. df = DataFrame(columns=["x", "y"]) + # df["x"] = df["x"].astype(np.int64) + # df.loc[:, "x"] = [1, 2, 3] + self._setitem_single_column(ilocs[0], value, pi) + + else: + raise ValueError( + "Must have equal len keys and value " + "when setting with an iterable" + ) + + else: + # scalar value + for loc in ilocs: + self._setitem_single_column(loc, value, pi) + + def _setitem_with_indexer_2d_value(self, indexer, value): + # We get here with np.ndim(value) == 2, excluding DataFrame, + # which goes through _setitem_with_indexer_frame_value + pi = indexer[0] + + ilocs = self._ensure_iterable_column_indexer(indexer[1]) + + if not is_array_like(value): + # cast lists to array + value = np.array(value, dtype=object) + if len(ilocs) != value.shape[1]: + raise ValueError( + "Must have equal len keys and value when setting with an ndarray" + ) + + for i, loc in enumerate(ilocs): + value_col = value[:, i] + if is_object_dtype(value_col.dtype): + # casting to list so that we do type inference in setitem_single_column + value_col = value_col.tolist() + self._setitem_single_column(loc, value_col, pi) + + def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str): + ilocs = self._ensure_iterable_column_indexer(indexer[1]) + + sub_indexer = list(indexer) + pi = indexer[0] + + multiindex_indexer = isinstance(self.obj.columns, MultiIndex) + + unique_cols = value.columns.is_unique + + # We do not want to align the value in case of iloc GH#37728 + if name == "iloc": + for i, loc in enumerate(ilocs): + val = value.iloc[:, i] + self._setitem_single_column(loc, val, pi) + + elif not unique_cols and value.columns.equals(self.obj.columns): + # We assume we are already aligned, see + # test_iloc_setitem_frame_duplicate_columns_multiple_blocks + for loc in ilocs: + item = self.obj.columns[loc] + if item in value: + sub_indexer[1] = item + val = self._align_series( + tuple(sub_indexer), + value.iloc[:, loc], + multiindex_indexer, + ) + else: + val = np.nan + + self._setitem_single_column(loc, val, pi) + + elif not unique_cols: + raise ValueError("Setting with non-unique columns is not allowed.") + + else: + for loc in ilocs: + item = self.obj.columns[loc] + if item in value: + sub_indexer[1] = item + val = self._align_series( + tuple(sub_indexer), + value[item], + multiindex_indexer, + using_cow=using_copy_on_write(), + ) + else: + val = np.nan + + self._setitem_single_column(loc, val, pi) + + def _setitem_single_column(self, loc: int, value, plane_indexer) -> None: + """ + + Parameters + ---------- + loc : int + Indexer for column position + plane_indexer : int, slice, listlike[int] + The indexer we use for setitem along axis=0. + """ + pi = plane_indexer + + is_full_setter = com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj)) + + is_null_setter = com.is_empty_slice(pi) or is_array_like(pi) and len(pi) == 0 + + if is_null_setter: + # no-op, don't cast dtype later + return + + elif is_full_setter: + try: + self.obj._mgr.column_setitem( + loc, plane_indexer, value, inplace_only=True + ) + except (ValueError, TypeError, LossySetitemError): + # If we're setting an entire column and we can't do it inplace, + # then we can use value's dtype (or inferred dtype) + # instead of object + dtype = self.obj.dtypes.iloc[loc] + if dtype not in (np.void, object) and not self.obj.empty: + # - Exclude np.void, as that is a special case for expansion. + # We want to warn for + # df = pd.DataFrame({'a': [1, 2]}) + # df.loc[:, 'a'] = .3 + # but not for + # df = pd.DataFrame({'a': [1, 2]}) + # df.loc[:, 'b'] = .3 + # - Exclude `object`, as then no upcasting happens. + # - Exclude empty initial object with enlargement, + # as then there's nothing to be inconsistent with. + warnings.warn( + f"Setting an item of incompatible dtype is deprecated " + "and will raise in a future error of pandas. " + f"Value '{value}' has dtype incompatible with {dtype}, " + "please explicitly cast to a compatible dtype first.", + FutureWarning, + stacklevel=find_stack_level(), + ) + self.obj.isetitem(loc, value) + else: + # set value into the column (first attempting to operate inplace, then + # falling back to casting if necessary) + dtype = self.obj.dtypes.iloc[loc] + if dtype == np.void: + # This means we're expanding, with multiple columns, e.g. + # df = pd.DataFrame({'A': [1,2,3], 'B': [4,5,6]}) + # df.loc[df.index <= 2, ['F', 'G']] = (1, 'abc') + # Columns F and G will initially be set to np.void. + # Here, we replace those temporary `np.void` columns with + # columns of the appropriate dtype, based on `value`. + self.obj.iloc[:, loc] = construct_1d_array_from_inferred_fill_value( + value, len(self.obj) + ) + self.obj._mgr.column_setitem(loc, plane_indexer, value) + + self.obj._clear_item_cache() + + def _setitem_single_block(self, indexer, value, name: str) -> None: + """ + _setitem_with_indexer for the case when we have a single Block. + """ + from pandas import Series + + if (isinstance(value, ABCSeries) and name != "iloc") or isinstance(value, dict): + # TODO(EA): ExtensionBlock.setitem this causes issues with + # setting for extensionarrays that store dicts. Need to decide + # if it's worth supporting that. + value = self._align_series(indexer, Series(value)) + + info_axis = self.obj._info_axis_number + item_labels = self.obj._get_axis(info_axis) + if isinstance(indexer, tuple): + # if we are setting on the info axis ONLY + # set using those methods to avoid block-splitting + # logic here + if ( + self.ndim == len(indexer) == 2 + and is_integer(indexer[1]) + and com.is_null_slice(indexer[0]) + ): + col = item_labels[indexer[info_axis]] + if len(item_labels.get_indexer_for([col])) == 1: + # e.g. test_loc_setitem_empty_append_expands_rows + loc = item_labels.get_loc(col) + self._setitem_single_column(loc, value, indexer[0]) + return + + indexer = maybe_convert_ix(*indexer) # e.g. test_setitem_frame_align + + if isinstance(value, ABCDataFrame) and name != "iloc": + value = self._align_frame(indexer, value)._values + + # check for chained assignment + self.obj._check_is_chained_assignment_possible() + + # actually do the set + self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value) + self.obj._maybe_update_cacher(clear=True, inplace=True) + + def _setitem_with_indexer_missing(self, indexer, value): + """ + Insert new row(s) or column(s) into the Series or DataFrame. + """ + from pandas import Series + + # reindex the axis to the new value + # and set inplace + if self.ndim == 1: + index = self.obj.index + with warnings.catch_warnings(): + # TODO: re-issue this with setitem-specific message? + warnings.filterwarnings( + "ignore", + "The behavior of Index.insert with object-dtype is deprecated", + category=FutureWarning, + ) + new_index = index.insert(len(index), indexer) + + # we have a coerced indexer, e.g. a float + # that matches in an int64 Index, so + # we will not create a duplicate index, rather + # index to that element + # e.g. 0.0 -> 0 + # GH#12246 + if index.is_unique: + # pass new_index[-1:] instead if [new_index[-1]] + # so that we retain dtype + new_indexer = index.get_indexer(new_index[-1:]) + if (new_indexer != -1).any(): + # We get only here with loc, so can hard code + return self._setitem_with_indexer(new_indexer, value, "loc") + + # this preserves dtype of the value and of the object + if not is_scalar(value): + new_dtype = None + + elif is_valid_na_for_dtype(value, self.obj.dtype): + if not is_object_dtype(self.obj.dtype): + # Every NA value is suitable for object, no conversion needed + value = na_value_for_dtype(self.obj.dtype, compat=False) + + new_dtype = maybe_promote(self.obj.dtype, value)[0] + + elif isna(value): + new_dtype = None + elif not self.obj.empty and not is_object_dtype(self.obj.dtype): + # We should not cast, if we have object dtype because we can + # set timedeltas into object series + curr_dtype = self.obj.dtype + curr_dtype = getattr(curr_dtype, "numpy_dtype", curr_dtype) + new_dtype = maybe_promote(curr_dtype, value)[0] + else: + new_dtype = None + + new_values = Series([value], dtype=new_dtype)._values + + if len(self.obj._values): + # GH#22717 handle casting compatibility that np.concatenate + # does incorrectly + new_values = concat_compat([self.obj._values, new_values]) + self.obj._mgr = self.obj._constructor( + new_values, index=new_index, name=self.obj.name + )._mgr + self.obj._maybe_update_cacher(clear=True) + + elif self.ndim == 2: + if not len(self.obj.columns): + # no columns and scalar + raise ValueError("cannot set a frame with no defined columns") + + has_dtype = hasattr(value, "dtype") + if isinstance(value, ABCSeries): + # append a Series + value = value.reindex(index=self.obj.columns, copy=True) + value.name = indexer + elif isinstance(value, dict): + value = Series( + value, index=self.obj.columns, name=indexer, dtype=object + ) + else: + # a list-list + if is_list_like_indexer(value): + # must have conforming columns + if len(value) != len(self.obj.columns): + raise ValueError("cannot set a row with mismatched columns") + + value = Series(value, index=self.obj.columns, name=indexer) + + if not len(self.obj): + # We will ignore the existing dtypes instead of using + # internals.concat logic + df = value.to_frame().T + + idx = self.obj.index + if isinstance(idx, MultiIndex): + name = idx.names + else: + name = idx.name + + df.index = Index([indexer], name=name) + if not has_dtype: + # i.e. if we already had a Series or ndarray, keep that + # dtype. But if we had a list or dict, then do inference + df = df.infer_objects(copy=False) + self.obj._mgr = df._mgr + else: + self.obj._mgr = self.obj._append(value)._mgr + self.obj._maybe_update_cacher(clear=True) + + def _ensure_iterable_column_indexer(self, column_indexer): + """ + Ensure that our column indexer is something that can be iterated over. + """ + ilocs: Sequence[int | np.integer] | np.ndarray + if is_integer(column_indexer): + ilocs = [column_indexer] + elif isinstance(column_indexer, slice): + ilocs = np.arange(len(self.obj.columns))[column_indexer] + elif ( + isinstance(column_indexer, np.ndarray) and column_indexer.dtype.kind == "b" + ): + ilocs = np.arange(len(column_indexer))[column_indexer] + else: + ilocs = column_indexer + return ilocs + + def _align_series( + self, + indexer, + ser: Series, + multiindex_indexer: bool = False, + using_cow: bool = False, + ): + """ + Parameters + ---------- + indexer : tuple, slice, scalar + Indexer used to get the locations that will be set to `ser`. + ser : pd.Series + Values to assign to the locations specified by `indexer`. + multiindex_indexer : bool, optional + Defaults to False. Should be set to True if `indexer` was from + a `pd.MultiIndex`, to avoid unnecessary broadcasting. + + Returns + ------- + `np.array` of `ser` broadcast to the appropriate shape for assignment + to the locations selected by `indexer` + """ + if isinstance(indexer, (slice, np.ndarray, list, Index)): + indexer = (indexer,) + + if isinstance(indexer, tuple): + # flatten np.ndarray indexers + def ravel(i): + return i.ravel() if isinstance(i, np.ndarray) else i + + indexer = tuple(map(ravel, indexer)) + + aligners = [not com.is_null_slice(idx) for idx in indexer] + sum_aligners = sum(aligners) + single_aligner = sum_aligners == 1 + is_frame = self.ndim == 2 + obj = self.obj + + # are we a single alignable value on a non-primary + # dim (e.g. panel: 1,2, or frame: 0) ? + # hence need to align to a single axis dimension + # rather that find all valid dims + + # frame + if is_frame: + single_aligner = single_aligner and aligners[0] + + # we have a frame, with multiple indexers on both axes; and a + # series, so need to broadcast (see GH5206) + if sum_aligners == self.ndim and all(is_sequence(_) for _ in indexer): + ser_values = ser.reindex(obj.axes[0][indexer[0]], copy=True)._values + + # single indexer + if len(indexer) > 1 and not multiindex_indexer: + len_indexer = len(indexer[1]) + ser_values = ( + np.tile(ser_values, len_indexer).reshape(len_indexer, -1).T + ) + + return ser_values + + for i, idx in enumerate(indexer): + ax = obj.axes[i] + + # multiple aligners (or null slices) + if is_sequence(idx) or isinstance(idx, slice): + if single_aligner and com.is_null_slice(idx): + continue + new_ix = ax[idx] + if not is_list_like_indexer(new_ix): + new_ix = Index([new_ix]) + else: + new_ix = Index(new_ix) + if ser.index.equals(new_ix): + if using_cow: + return ser + return ser._values.copy() + + return ser.reindex(new_ix)._values + + # 2 dims + elif single_aligner: + # reindex along index + ax = self.obj.axes[1] + if ser.index.equals(ax) or not len(ax): + return ser._values.copy() + return ser.reindex(ax)._values + + elif is_integer(indexer) and self.ndim == 1: + if is_object_dtype(self.obj.dtype): + return ser + ax = self.obj._get_axis(0) + + if ser.index.equals(ax): + return ser._values.copy() + + return ser.reindex(ax)._values[indexer] + + elif is_integer(indexer): + ax = self.obj._get_axis(1) + + if ser.index.equals(ax): + return ser._values.copy() + + return ser.reindex(ax)._values + + raise ValueError("Incompatible indexer with Series") + + def _align_frame(self, indexer, df: DataFrame) -> DataFrame: + is_frame = self.ndim == 2 + + if isinstance(indexer, tuple): + idx, cols = None, None + sindexers = [] + for i, ix in enumerate(indexer): + ax = self.obj.axes[i] + if is_sequence(ix) or isinstance(ix, slice): + if isinstance(ix, np.ndarray): + ix = ix.ravel() + if idx is None: + idx = ax[ix] + elif cols is None: + cols = ax[ix] + else: + break + else: + sindexers.append(i) + + if idx is not None and cols is not None: + if df.index.equals(idx) and df.columns.equals(cols): + val = df.copy() + else: + val = df.reindex(idx, columns=cols) + return val + + elif (isinstance(indexer, slice) or is_list_like_indexer(indexer)) and is_frame: + ax = self.obj.index[indexer] + if df.index.equals(ax): + val = df.copy() + else: + # we have a multi-index and are trying to align + # with a particular, level GH3738 + if ( + isinstance(ax, MultiIndex) + and isinstance(df.index, MultiIndex) + and ax.nlevels != df.index.nlevels + ): + raise TypeError( + "cannot align on a multi-index with out " + "specifying the join levels" + ) + + val = df.reindex(index=ax) + return val + + raise ValueError("Incompatible indexer with DataFrame") + + +class _ScalarAccessIndexer(NDFrameIndexerBase): + """ + Access scalars quickly. + """ + + # sub-classes need to set _takeable + _takeable: bool + + def _convert_key(self, key): + raise AbstractMethodError(self) + + def __getitem__(self, key): + if not isinstance(key, tuple): + # we could have a convertible item here (e.g. Timestamp) + if not is_list_like_indexer(key): + key = (key,) + else: + raise ValueError("Invalid call for scalar access (getting)!") + + key = self._convert_key(key) + return self.obj._get_value(*key, takeable=self._takeable) + + def __setitem__(self, key, value) -> None: + if isinstance(key, tuple): + key = tuple(com.apply_if_callable(x, self.obj) for x in key) + else: + # scalar callable may return tuple + key = com.apply_if_callable(key, self.obj) + + if not isinstance(key, tuple): + key = _tuplify(self.ndim, key) + key = list(self._convert_key(key)) + if len(key) != self.ndim: + raise ValueError("Not enough indexers for scalar access (setting)!") + + self.obj._set_value(*key, value=value, takeable=self._takeable) + + +@doc(IndexingMixin.at) +class _AtIndexer(_ScalarAccessIndexer): + _takeable = False + + def _convert_key(self, key): + """ + Require they keys to be the same type as the index. (so we don't + fallback) + """ + # GH 26989 + # For series, unpacking key needs to result in the label. + # This is already the case for len(key) == 1; e.g. (1,) + if self.ndim == 1 and len(key) > 1: + key = (key,) + + return key + + @property + def _axes_are_unique(self) -> bool: + # Only relevant for self.ndim == 2 + assert self.ndim == 2 + return self.obj.index.is_unique and self.obj.columns.is_unique + + def __getitem__(self, key): + if self.ndim == 2 and not self._axes_are_unique: + # GH#33041 fall back to .loc + if not isinstance(key, tuple) or not all(is_scalar(x) for x in key): + raise ValueError("Invalid call for scalar access (getting)!") + return self.obj.loc[key] + + return super().__getitem__(key) + + def __setitem__(self, key, value) -> None: + if self.ndim == 2 and not self._axes_are_unique: + # GH#33041 fall back to .loc + if not isinstance(key, tuple) or not all(is_scalar(x) for x in key): + raise ValueError("Invalid call for scalar access (setting)!") + + self.obj.loc[key] = value + return + + return super().__setitem__(key, value) + + +@doc(IndexingMixin.iat) +class _iAtIndexer(_ScalarAccessIndexer): + _takeable = True + + def _convert_key(self, key): + """ + Require integer args. (and convert to label arguments) + """ + for i in key: + if not is_integer(i): + raise ValueError("iAt based indexing can only have integer indexers") + return key + + +def _tuplify(ndim: int, loc: Hashable) -> tuple[Hashable | slice, ...]: + """ + Given an indexer for the first dimension, create an equivalent tuple + for indexing over all dimensions. + + Parameters + ---------- + ndim : int + loc : object + + Returns + ------- + tuple + """ + _tup: list[Hashable | slice] + _tup = [slice(None, None) for _ in range(ndim)] + _tup[0] = loc + return tuple(_tup) + + +def _tupleize_axis_indexer(ndim: int, axis: AxisInt, key) -> tuple: + """ + If we have an axis, adapt the given key to be axis-independent. + """ + new_key = [slice(None)] * ndim + new_key[axis] = key + return tuple(new_key) + + +def check_bool_indexer(index: Index, key) -> np.ndarray: + """ + Check if key is a valid boolean indexer for an object with such index and + perform reindexing or conversion if needed. + + This function assumes that is_bool_indexer(key) == True. + + Parameters + ---------- + index : Index + Index of the object on which the indexing is done. + key : list-like + Boolean indexer to check. + + Returns + ------- + np.array + Resulting key. + + Raises + ------ + IndexError + If the key does not have the same length as index. + IndexingError + If the index of the key is unalignable to index. + """ + result = key + if isinstance(key, ABCSeries) and not key.index.equals(index): + indexer = result.index.get_indexer_for(index) + if -1 in indexer: + raise IndexingError( + "Unalignable boolean Series provided as " + "indexer (index of the boolean Series and of " + "the indexed object do not match)." + ) + + result = result.take(indexer) + + # fall through for boolean + if not isinstance(result.dtype, ExtensionDtype): + return result.astype(bool)._values + + if is_object_dtype(key): + # key might be object-dtype bool, check_array_indexer needs bool array + result = np.asarray(result, dtype=bool) + elif not is_array_like(result): + # GH 33924 + # key may contain nan elements, check_array_indexer needs bool array + result = pd_array(result, dtype=bool) + return check_array_indexer(index, result) + + +def convert_missing_indexer(indexer): + """ + Reverse convert a missing indexer, which is a dict + return the scalar indexer and a boolean indicating if we converted + """ + if isinstance(indexer, dict): + # a missing key (but not a tuple indexer) + indexer = indexer["key"] + + if isinstance(indexer, bool): + raise KeyError("cannot use a single bool to index into setitem") + return indexer, True + + return indexer, False + + +def convert_from_missing_indexer_tuple(indexer, axes): + """ + Create a filtered indexer that doesn't have any missing indexers. + """ + + def get_indexer(_i, _idx): + return axes[_i].get_loc(_idx["key"]) if isinstance(_idx, dict) else _idx + + return tuple(get_indexer(_i, _idx) for _i, _idx in enumerate(indexer)) + + +def maybe_convert_ix(*args): + """ + We likely want to take the cross-product. + """ + for arg in args: + if not isinstance(arg, (np.ndarray, list, ABCSeries, Index)): + return args + return np.ix_(*args) + + +def is_nested_tuple(tup, labels) -> bool: + """ + Returns + ------- + bool + """ + # check for a compatible nested tuple and multiindexes among the axes + if not isinstance(tup, tuple): + return False + + for k in tup: + if is_list_like(k) or isinstance(k, slice): + return isinstance(labels, MultiIndex) + + return False + + +def is_label_like(key) -> bool: + """ + Returns + ------- + bool + """ + # select a label or row + return ( + not isinstance(key, slice) + and not is_list_like_indexer(key) + and key is not Ellipsis + ) + + +def need_slice(obj: slice) -> bool: + """ + Returns + ------- + bool + """ + return ( + obj.start is not None + or obj.stop is not None + or (obj.step is not None and obj.step != 1) + ) + + +def check_dict_or_set_indexers(key) -> None: + """ + Check if the indexer is or contains a dict or set, which is no longer allowed. + """ + if ( + isinstance(key, set) + or isinstance(key, tuple) + and any(isinstance(x, set) for x in key) + ): + raise TypeError( + "Passing a set as an indexer is not supported. Use a list instead." + ) + + if ( + isinstance(key, dict) + or isinstance(key, tuple) + and any(isinstance(x, dict) for x in key) + ): + raise TypeError( + "Passing a dict as an indexer is not supported. Use a list instead." + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/missing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/missing.py new file mode 100644 index 0000000000000000000000000000000000000000..c016aab8ad0748220d04d047f84d3aa14564ab80 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/missing.py @@ -0,0 +1,1158 @@ +""" +Routines for filling missing data. +""" +from __future__ import annotations + +from functools import wraps +from typing import ( + TYPE_CHECKING, + Any, + Literal, + cast, + overload, +) + +import numpy as np + +from pandas._libs import ( + NaT, + algos, + lib, +) +from pandas._typing import ( + ArrayLike, + AxisInt, + F, + ReindexMethod, + npt, +) +from pandas.compat._optional import import_optional_dependency + +from pandas.core.dtypes.cast import infer_dtype_from +from pandas.core.dtypes.common import ( + is_array_like, + is_bool_dtype, + is_numeric_dtype, + is_numeric_v_string_like, + is_object_dtype, + needs_i8_conversion, +) +from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.missing import ( + is_valid_na_for_dtype, + isna, + na_value_for_dtype, +) + +if TYPE_CHECKING: + from pandas import Index + + +def check_value_size(value, mask: npt.NDArray[np.bool_], length: int): + """ + Validate the size of the values passed to ExtensionArray.fillna. + """ + if is_array_like(value): + if len(value) != length: + raise ValueError( + f"Length of 'value' does not match. Got ({len(value)}) " + f" expected {length}" + ) + value = value[mask] + + return value + + +def mask_missing(arr: ArrayLike, values_to_mask) -> npt.NDArray[np.bool_]: + """ + Return a masking array of same size/shape as arr + with entries equaling any member of values_to_mask set to True + + Parameters + ---------- + arr : ArrayLike + values_to_mask: list, tuple, or scalar + + Returns + ------- + np.ndarray[bool] + """ + # When called from Block.replace/replace_list, values_to_mask is a scalar + # known to be holdable by arr. + # When called from Series._single_replace, values_to_mask is tuple or list + dtype, values_to_mask = infer_dtype_from(values_to_mask) + + if isinstance(dtype, np.dtype): + values_to_mask = np.array(values_to_mask, dtype=dtype) + else: + cls = dtype.construct_array_type() + if not lib.is_list_like(values_to_mask): + values_to_mask = [values_to_mask] + values_to_mask = cls._from_sequence(values_to_mask, dtype=dtype, copy=False) + + potential_na = False + if is_object_dtype(arr.dtype): + # pre-compute mask to avoid comparison to NA + potential_na = True + arr_mask = ~isna(arr) + + na_mask = isna(values_to_mask) + nonna = values_to_mask[~na_mask] + + # GH 21977 + mask = np.zeros(arr.shape, dtype=bool) + if ( + is_numeric_dtype(arr.dtype) + and not is_bool_dtype(arr.dtype) + and is_bool_dtype(nonna.dtype) + ): + pass + elif ( + is_bool_dtype(arr.dtype) + and is_numeric_dtype(nonna.dtype) + and not is_bool_dtype(nonna.dtype) + ): + pass + else: + for x in nonna: + if is_numeric_v_string_like(arr, x): + # GH#29553 prevent numpy deprecation warnings + pass + else: + if potential_na: + new_mask = np.zeros(arr.shape, dtype=np.bool_) + new_mask[arr_mask] = arr[arr_mask] == x + else: + new_mask = arr == x + + if not isinstance(new_mask, np.ndarray): + # usually BooleanArray + new_mask = new_mask.to_numpy(dtype=bool, na_value=False) + mask |= new_mask + + if na_mask.any(): + mask |= isna(arr) + + return mask + + +@overload +def clean_fill_method( + method: Literal["ffill", "pad", "bfill", "backfill"], + *, + allow_nearest: Literal[False] = ..., +) -> Literal["pad", "backfill"]: + ... + + +@overload +def clean_fill_method( + method: Literal["ffill", "pad", "bfill", "backfill", "nearest"], + *, + allow_nearest: Literal[True], +) -> Literal["pad", "backfill", "nearest"]: + ... + + +def clean_fill_method( + method: Literal["ffill", "pad", "bfill", "backfill", "nearest"], + *, + allow_nearest: bool = False, +) -> Literal["pad", "backfill", "nearest"]: + if isinstance(method, str): + # error: Incompatible types in assignment (expression has type "str", variable + # has type "Literal['ffill', 'pad', 'bfill', 'backfill', 'nearest']") + method = method.lower() # type: ignore[assignment] + if method == "ffill": + method = "pad" + elif method == "bfill": + method = "backfill" + + valid_methods = ["pad", "backfill"] + expecting = "pad (ffill) or backfill (bfill)" + if allow_nearest: + valid_methods.append("nearest") + expecting = "pad (ffill), backfill (bfill) or nearest" + if method not in valid_methods: + raise ValueError(f"Invalid fill method. Expecting {expecting}. Got {method}") + return method + + +# interpolation methods that dispatch to np.interp + +NP_METHODS = ["linear", "time", "index", "values"] + +# interpolation methods that dispatch to _interpolate_scipy_wrapper + +SP_METHODS = [ + "nearest", + "zero", + "slinear", + "quadratic", + "cubic", + "barycentric", + "krogh", + "spline", + "polynomial", + "from_derivatives", + "piecewise_polynomial", + "pchip", + "akima", + "cubicspline", +] + + +def clean_interp_method(method: str, index: Index, **kwargs) -> str: + order = kwargs.get("order") + + if method in ("spline", "polynomial") and order is None: + raise ValueError("You must specify the order of the spline or polynomial.") + + valid = NP_METHODS + SP_METHODS + if method not in valid: + raise ValueError(f"method must be one of {valid}. Got '{method}' instead.") + + if method in ("krogh", "piecewise_polynomial", "pchip"): + if not index.is_monotonic_increasing: + raise ValueError( + f"{method} interpolation requires that the index be monotonic." + ) + + return method + + +def find_valid_index(how: str, is_valid: npt.NDArray[np.bool_]) -> int | None: + """ + Retrieves the positional index of the first valid value. + + Parameters + ---------- + how : {'first', 'last'} + Use this parameter to change between the first or last valid index. + is_valid: np.ndarray + Mask to find na_values. + + Returns + ------- + int or None + """ + assert how in ["first", "last"] + + if len(is_valid) == 0: # early stop + return None + + if is_valid.ndim == 2: + is_valid = is_valid.any(axis=1) # reduce axis 1 + + if how == "first": + idxpos = is_valid[::].argmax() + + elif how == "last": + idxpos = len(is_valid) - 1 - is_valid[::-1].argmax() + + chk_notna = is_valid[idxpos] + + if not chk_notna: + return None + # Incompatible return value type (got "signedinteger[Any]", + # expected "Optional[int]") + return idxpos # type: ignore[return-value] + + +def validate_limit_direction( + limit_direction: str, +) -> Literal["forward", "backward", "both"]: + valid_limit_directions = ["forward", "backward", "both"] + limit_direction = limit_direction.lower() + if limit_direction not in valid_limit_directions: + raise ValueError( + "Invalid limit_direction: expecting one of " + f"{valid_limit_directions}, got '{limit_direction}'." + ) + # error: Incompatible return value type (got "str", expected + # "Literal['forward', 'backward', 'both']") + return limit_direction # type: ignore[return-value] + + +def validate_limit_area(limit_area: str | None) -> Literal["inside", "outside"] | None: + if limit_area is not None: + valid_limit_areas = ["inside", "outside"] + limit_area = limit_area.lower() + if limit_area not in valid_limit_areas: + raise ValueError( + f"Invalid limit_area: expecting one of {valid_limit_areas}, got " + f"{limit_area}." + ) + # error: Incompatible return value type (got "Optional[str]", expected + # "Optional[Literal['inside', 'outside']]") + return limit_area # type: ignore[return-value] + + +def infer_limit_direction( + limit_direction: Literal["backward", "forward", "both"] | None, method: str +) -> Literal["backward", "forward", "both"]: + # Set `limit_direction` depending on `method` + if limit_direction is None: + if method in ("backfill", "bfill"): + limit_direction = "backward" + else: + limit_direction = "forward" + else: + if method in ("pad", "ffill") and limit_direction != "forward": + raise ValueError( + f"`limit_direction` must be 'forward' for method `{method}`" + ) + if method in ("backfill", "bfill") and limit_direction != "backward": + raise ValueError( + f"`limit_direction` must be 'backward' for method `{method}`" + ) + return limit_direction + + +def get_interp_index(method, index: Index) -> Index: + # create/use the index + if method == "linear": + # prior default + from pandas import Index + + index = Index(np.arange(len(index))) + else: + methods = {"index", "values", "nearest", "time"} + is_numeric_or_datetime = ( + is_numeric_dtype(index.dtype) + or isinstance(index.dtype, DatetimeTZDtype) + or lib.is_np_dtype(index.dtype, "mM") + ) + if method not in methods and not is_numeric_or_datetime: + raise ValueError( + "Index column must be numeric or datetime type when " + f"using {method} method other than linear. " + "Try setting a numeric or datetime index column before " + "interpolating." + ) + + if isna(index).any(): + raise NotImplementedError( + "Interpolation with NaNs in the index " + "has not been implemented. Try filling " + "those NaNs before interpolating." + ) + return index + + +def interpolate_2d_inplace( + data: np.ndarray, # floating dtype + index: Index, + axis: AxisInt, + method: str = "linear", + limit: int | None = None, + limit_direction: str = "forward", + limit_area: str | None = None, + fill_value: Any | None = None, + mask=None, + **kwargs, +) -> None: + """ + Column-wise application of _interpolate_1d. + + Notes + ----- + Alters 'data' in-place. + + The signature does differ from _interpolate_1d because it only + includes what is needed for Block.interpolate. + """ + # validate the interp method + clean_interp_method(method, index, **kwargs) + + if is_valid_na_for_dtype(fill_value, data.dtype): + fill_value = na_value_for_dtype(data.dtype, compat=False) + + if method == "time": + if not needs_i8_conversion(index.dtype): + raise ValueError( + "time-weighted interpolation only works " + "on Series or DataFrames with a " + "DatetimeIndex" + ) + method = "values" + + limit_direction = validate_limit_direction(limit_direction) + limit_area_validated = validate_limit_area(limit_area) + + # default limit is unlimited GH #16282 + limit = algos.validate_limit(nobs=None, limit=limit) + + indices = _index_to_interp_indices(index, method) + + def func(yvalues: np.ndarray) -> None: + # process 1-d slices in the axis direction + + _interpolate_1d( + indices=indices, + yvalues=yvalues, + method=method, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area_validated, + fill_value=fill_value, + bounds_error=False, + mask=mask, + **kwargs, + ) + + # error: Argument 1 to "apply_along_axis" has incompatible type + # "Callable[[ndarray[Any, Any]], None]"; expected "Callable[..., + # Union[_SupportsArray[dtype[]], Sequence[_SupportsArray + # [dtype[]]], Sequence[Sequence[_SupportsArray[dtype[]]]], + # Sequence[Sequence[Sequence[_SupportsArray[dtype[]]]]], + # Sequence[Sequence[Sequence[Sequence[_SupportsArray[dtype[]]]]]]]]" + np.apply_along_axis(func, axis, data) # type: ignore[arg-type] + + +def _index_to_interp_indices(index: Index, method: str) -> np.ndarray: + """ + Convert Index to ndarray of indices to pass to NumPy/SciPy. + """ + xarr = index._values + if needs_i8_conversion(xarr.dtype): + # GH#1646 for dt64tz + xarr = xarr.view("i8") + + if method == "linear": + inds = xarr + inds = cast(np.ndarray, inds) + else: + inds = np.asarray(xarr) + + if method in ("values", "index"): + if inds.dtype == np.object_: + inds = lib.maybe_convert_objects(inds) + + return inds + + +def _interpolate_1d( + indices: np.ndarray, + yvalues: np.ndarray, + method: str = "linear", + limit: int | None = None, + limit_direction: str = "forward", + limit_area: Literal["inside", "outside"] | None = None, + fill_value: Any | None = None, + bounds_error: bool = False, + order: int | None = None, + mask=None, + **kwargs, +) -> None: + """ + Logic for the 1-d interpolation. The input + indices and yvalues will each be 1-d arrays of the same length. + + Bounds_error is currently hardcoded to False since non-scipy ones don't + take it as an argument. + + Notes + ----- + Fills 'yvalues' in-place. + """ + if mask is not None: + invalid = mask + else: + invalid = isna(yvalues) + valid = ~invalid + + if not valid.any(): + return + + if valid.all(): + return + + # These are sets of index pointers to invalid values... i.e. {0, 1, etc... + all_nans = set(np.flatnonzero(invalid)) + + first_valid_index = find_valid_index(how="first", is_valid=valid) + if first_valid_index is None: # no nan found in start + first_valid_index = 0 + start_nans = set(range(first_valid_index)) + + last_valid_index = find_valid_index(how="last", is_valid=valid) + if last_valid_index is None: # no nan found in end + last_valid_index = len(yvalues) + end_nans = set(range(1 + last_valid_index, len(valid))) + + # Like the sets above, preserve_nans contains indices of invalid values, + # but in this case, it is the final set of indices that need to be + # preserved as NaN after the interpolation. + + # For example if limit_direction='forward' then preserve_nans will + # contain indices of NaNs at the beginning of the series, and NaNs that + # are more than 'limit' away from the prior non-NaN. + + # set preserve_nans based on direction using _interp_limit + preserve_nans: list | set + if limit_direction == "forward": + preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) + elif limit_direction == "backward": + preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) + else: + # both directions... just use _interp_limit + preserve_nans = set(_interp_limit(invalid, limit, limit)) + + # if limit_area is set, add either mid or outside indices + # to preserve_nans GH #16284 + if limit_area == "inside": + # preserve NaNs on the outside + preserve_nans |= start_nans | end_nans + elif limit_area == "outside": + # preserve NaNs on the inside + mid_nans = all_nans - start_nans - end_nans + preserve_nans |= mid_nans + + # sort preserve_nans and convert to list + preserve_nans = sorted(preserve_nans) + + is_datetimelike = yvalues.dtype.kind in "mM" + + if is_datetimelike: + yvalues = yvalues.view("i8") + + if method in NP_METHODS: + # np.interp requires sorted X values, #21037 + + indexer = np.argsort(indices[valid]) + yvalues[invalid] = np.interp( + indices[invalid], indices[valid][indexer], yvalues[valid][indexer] + ) + else: + yvalues[invalid] = _interpolate_scipy_wrapper( + indices[valid], + yvalues[valid], + indices[invalid], + method=method, + fill_value=fill_value, + bounds_error=bounds_error, + order=order, + **kwargs, + ) + + if mask is not None: + mask[:] = False + mask[preserve_nans] = True + elif is_datetimelike: + yvalues[preserve_nans] = NaT.value + else: + yvalues[preserve_nans] = np.nan + return + + +def _interpolate_scipy_wrapper( + x: np.ndarray, + y: np.ndarray, + new_x: np.ndarray, + method: str, + fill_value=None, + bounds_error: bool = False, + order=None, + **kwargs, +): + """ + Passed off to scipy.interpolate.interp1d. method is scipy's kind. + Returns an array interpolated at new_x. Add any new methods to + the list in _clean_interp_method. + """ + extra = f"{method} interpolation requires SciPy." + import_optional_dependency("scipy", extra=extra) + from scipy import interpolate + + new_x = np.asarray(new_x) + + # ignores some kwargs that could be passed along. + alt_methods = { + "barycentric": interpolate.barycentric_interpolate, + "krogh": interpolate.krogh_interpolate, + "from_derivatives": _from_derivatives, + "piecewise_polynomial": _from_derivatives, + "cubicspline": _cubicspline_interpolate, + "akima": _akima_interpolate, + "pchip": interpolate.pchip_interpolate, + } + + interp1d_methods = [ + "nearest", + "zero", + "slinear", + "quadratic", + "cubic", + "polynomial", + ] + if method in interp1d_methods: + if method == "polynomial": + kind = order + else: + kind = method + terp = interpolate.interp1d( + x, y, kind=kind, fill_value=fill_value, bounds_error=bounds_error + ) + new_y = terp(new_x) + elif method == "spline": + # GH #10633, #24014 + if isna(order) or (order <= 0): + raise ValueError( + f"order needs to be specified and greater than 0; got order: {order}" + ) + terp = interpolate.UnivariateSpline(x, y, k=order, **kwargs) + new_y = terp(new_x) + else: + # GH 7295: need to be able to write for some reason + # in some circumstances: check all three + if not x.flags.writeable: + x = x.copy() + if not y.flags.writeable: + y = y.copy() + if not new_x.flags.writeable: + new_x = new_x.copy() + terp = alt_methods[method] + new_y = terp(x, y, new_x, **kwargs) + return new_y + + +def _from_derivatives( + xi: np.ndarray, + yi: np.ndarray, + x: np.ndarray, + order=None, + der: int | list[int] | None = 0, + extrapolate: bool = False, +): + """ + Convenience function for interpolate.BPoly.from_derivatives. + + Construct a piecewise polynomial in the Bernstein basis, compatible + with the specified values and derivatives at breakpoints. + + Parameters + ---------- + xi : array-like + sorted 1D array of x-coordinates + yi : array-like or list of array-likes + yi[i][j] is the j-th derivative known at xi[i] + order: None or int or array-like of ints. Default: None. + Specifies the degree of local polynomials. If not None, some + derivatives are ignored. + der : int or list + How many derivatives to extract; None for all potentially nonzero + derivatives (that is a number equal to the number of points), or a + list of derivatives to extract. This number includes the function + value as 0th derivative. + extrapolate : bool, optional + Whether to extrapolate to ouf-of-bounds points based on first and last + intervals, or to return NaNs. Default: True. + + See Also + -------- + scipy.interpolate.BPoly.from_derivatives + + Returns + ------- + y : scalar or array-like + The result, of length R or length M or M by R. + """ + from scipy import interpolate + + # return the method for compat with scipy version & backwards compat + method = interpolate.BPoly.from_derivatives + m = method(xi, yi.reshape(-1, 1), orders=order, extrapolate=extrapolate) + + return m(x) + + +def _akima_interpolate( + xi: np.ndarray, + yi: np.ndarray, + x: np.ndarray, + der: int | list[int] | None = 0, + axis: AxisInt = 0, +): + """ + Convenience function for akima interpolation. + xi and yi are arrays of values used to approximate some function f, + with ``yi = f(xi)``. + + See `Akima1DInterpolator` for details. + + Parameters + ---------- + xi : np.ndarray + A sorted list of x-coordinates, of length N. + yi : np.ndarray + A 1-D array of real values. `yi`'s length along the interpolation + axis must be equal to the length of `xi`. If N-D array, use axis + parameter to select correct axis. + x : np.ndarray + Of length M. + der : int, optional + How many derivatives to extract; None for all potentially + nonzero derivatives (that is a number equal to the number + of points), or a list of derivatives to extract. This number + includes the function value as 0th derivative. + axis : int, optional + Axis in the yi array corresponding to the x-coordinate values. + + See Also + -------- + scipy.interpolate.Akima1DInterpolator + + Returns + ------- + y : scalar or array-like + The result, of length R or length M or M by R, + + """ + from scipy import interpolate + + P = interpolate.Akima1DInterpolator(xi, yi, axis=axis) + + return P(x, nu=der) + + +def _cubicspline_interpolate( + xi: np.ndarray, + yi: np.ndarray, + x: np.ndarray, + axis: AxisInt = 0, + bc_type: str | tuple[Any, Any] = "not-a-knot", + extrapolate=None, +): + """ + Convenience function for cubic spline data interpolator. + + See `scipy.interpolate.CubicSpline` for details. + + Parameters + ---------- + xi : np.ndarray, shape (n,) + 1-d array containing values of the independent variable. + Values must be real, finite and in strictly increasing order. + yi : np.ndarray + Array containing values of the dependent variable. It can have + arbitrary number of dimensions, but the length along ``axis`` + (see below) must match the length of ``x``. Values must be finite. + x : np.ndarray, shape (m,) + axis : int, optional + Axis along which `y` is assumed to be varying. Meaning that for + ``x[i]`` the corresponding values are ``np.take(y, i, axis=axis)``. + Default is 0. + bc_type : string or 2-tuple, optional + Boundary condition type. Two additional equations, given by the + boundary conditions, are required to determine all coefficients of + polynomials on each segment [2]_. + If `bc_type` is a string, then the specified condition will be applied + at both ends of a spline. Available conditions are: + * 'not-a-knot' (default): The first and second segment at a curve end + are the same polynomial. It is a good default when there is no + information on boundary conditions. + * 'periodic': The interpolated functions is assumed to be periodic + of period ``x[-1] - x[0]``. The first and last value of `y` must be + identical: ``y[0] == y[-1]``. This boundary condition will result in + ``y'[0] == y'[-1]`` and ``y''[0] == y''[-1]``. + * 'clamped': The first derivative at curves ends are zero. Assuming + a 1D `y`, ``bc_type=((1, 0.0), (1, 0.0))`` is the same condition. + * 'natural': The second derivative at curve ends are zero. Assuming + a 1D `y`, ``bc_type=((2, 0.0), (2, 0.0))`` is the same condition. + If `bc_type` is a 2-tuple, the first and the second value will be + applied at the curve start and end respectively. The tuple values can + be one of the previously mentioned strings (except 'periodic') or a + tuple `(order, deriv_values)` allowing to specify arbitrary + derivatives at curve ends: + * `order`: the derivative order, 1 or 2. + * `deriv_value`: array-like containing derivative values, shape must + be the same as `y`, excluding ``axis`` dimension. For example, if + `y` is 1D, then `deriv_value` must be a scalar. If `y` is 3D with + the shape (n0, n1, n2) and axis=2, then `deriv_value` must be 2D + and have the shape (n0, n1). + extrapolate : {bool, 'periodic', None}, optional + If bool, determines whether to extrapolate to out-of-bounds points + based on first and last intervals, or to return NaNs. If 'periodic', + periodic extrapolation is used. If None (default), ``extrapolate`` is + set to 'periodic' for ``bc_type='periodic'`` and to True otherwise. + + See Also + -------- + scipy.interpolate.CubicHermiteSpline + + Returns + ------- + y : scalar or array-like + The result, of shape (m,) + + References + ---------- + .. [1] `Cubic Spline Interpolation + `_ + on Wikiversity. + .. [2] Carl de Boor, "A Practical Guide to Splines", Springer-Verlag, 1978. + """ + from scipy import interpolate + + P = interpolate.CubicSpline( + xi, yi, axis=axis, bc_type=bc_type, extrapolate=extrapolate + ) + + return P(x) + + +def _interpolate_with_limit_area( + values: np.ndarray, + method: Literal["pad", "backfill"], + limit: int | None, + limit_area: Literal["inside", "outside"], +) -> None: + """ + Apply interpolation and limit_area logic to values along a to-be-specified axis. + + Parameters + ---------- + values: np.ndarray + Input array. + method: str + Interpolation method. Could be "bfill" or "pad" + limit: int, optional + Index limit on interpolation. + limit_area: {'inside', 'outside'} + Limit area for interpolation. + + Notes + ----- + Modifies values in-place. + """ + + invalid = isna(values) + is_valid = ~invalid + + if not invalid.all(): + first = find_valid_index(how="first", is_valid=is_valid) + if first is None: + first = 0 + last = find_valid_index(how="last", is_valid=is_valid) + if last is None: + last = len(values) + + pad_or_backfill_inplace( + values, + method=method, + limit=limit, + limit_area=limit_area, + ) + + if limit_area == "inside": + invalid[first : last + 1] = False + elif limit_area == "outside": + invalid[:first] = invalid[last + 1 :] = False + else: + raise ValueError("limit_area should be 'inside' or 'outside'") + + values[invalid] = np.nan + + +def pad_or_backfill_inplace( + values: np.ndarray, + method: Literal["pad", "backfill"] = "pad", + axis: AxisInt = 0, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, +) -> None: + """ + Perform an actual interpolation of values, values will be make 2-d if + needed fills inplace, returns the result. + + Parameters + ---------- + values: np.ndarray + Input array. + method: str, default "pad" + Interpolation method. Could be "bfill" or "pad" + axis: 0 or 1 + Interpolation axis + limit: int, optional + Index limit on interpolation. + limit_area: str, optional + Limit area for interpolation. Can be "inside" or "outside" + + Notes + ----- + Modifies values in-place. + """ + transf = (lambda x: x) if axis == 0 else (lambda x: x.T) + + # reshape a 1 dim if needed + if values.ndim == 1: + if axis != 0: # pragma: no cover + raise AssertionError("cannot interpolate on a ndim == 1 with axis != 0") + values = values.reshape(tuple((1,) + values.shape)) + + method = clean_fill_method(method) + tvalues = transf(values) + + func = get_fill_func(method, ndim=2) + # _pad_2d and _backfill_2d both modify tvalues inplace + func(tvalues, limit=limit, limit_area=limit_area) + + +def _fillna_prep( + values, mask: npt.NDArray[np.bool_] | None = None +) -> npt.NDArray[np.bool_]: + # boilerplate for _pad_1d, _backfill_1d, _pad_2d, _backfill_2d + + if mask is None: + mask = isna(values) + + return mask + + +def _datetimelike_compat(func: F) -> F: + """ + Wrapper to handle datetime64 and timedelta64 dtypes. + """ + + @wraps(func) + def new_func( + values, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + mask=None, + ): + if needs_i8_conversion(values.dtype): + if mask is None: + # This needs to occur before casting to int64 + mask = isna(values) + + result, mask = func( + values.view("i8"), limit=limit, limit_area=limit_area, mask=mask + ) + return result.view(values.dtype), mask + + return func(values, limit=limit, limit_area=limit_area, mask=mask) + + return cast(F, new_func) + + +@_datetimelike_compat +def _pad_1d( + values: np.ndarray, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + mask: npt.NDArray[np.bool_] | None = None, +) -> tuple[np.ndarray, npt.NDArray[np.bool_]]: + mask = _fillna_prep(values, mask) + if limit_area is not None and not mask.all(): + _fill_limit_area_1d(mask, limit_area) + algos.pad_inplace(values, mask, limit=limit) + return values, mask + + +@_datetimelike_compat +def _backfill_1d( + values: np.ndarray, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + mask: npt.NDArray[np.bool_] | None = None, +) -> tuple[np.ndarray, npt.NDArray[np.bool_]]: + mask = _fillna_prep(values, mask) + if limit_area is not None and not mask.all(): + _fill_limit_area_1d(mask, limit_area) + algos.backfill_inplace(values, mask, limit=limit) + return values, mask + + +@_datetimelike_compat +def _pad_2d( + values: np.ndarray, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + mask: npt.NDArray[np.bool_] | None = None, +): + mask = _fillna_prep(values, mask) + if limit_area is not None: + _fill_limit_area_2d(mask, limit_area) + + if values.size: + algos.pad_2d_inplace(values, mask, limit=limit) + else: + # for test coverage + pass + return values, mask + + +@_datetimelike_compat +def _backfill_2d( + values, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + mask: npt.NDArray[np.bool_] | None = None, +): + mask = _fillna_prep(values, mask) + if limit_area is not None: + _fill_limit_area_2d(mask, limit_area) + + if values.size: + algos.backfill_2d_inplace(values, mask, limit=limit) + else: + # for test coverage + pass + return values, mask + + +def _fill_limit_area_1d( + mask: npt.NDArray[np.bool_], limit_area: Literal["outside", "inside"] +) -> None: + """Prepare 1d mask for ffill/bfill with limit_area. + + Caller is responsible for checking at least one value of mask is False. + When called, mask will no longer faithfully represent when + the corresponding are NA or not. + + Parameters + ---------- + mask : np.ndarray[bool, ndim=1] + Mask representing NA values when filling. + limit_area : { "outside", "inside" } + Whether to limit filling to outside or inside the outer most non-NA value. + """ + neg_mask = ~mask + first = neg_mask.argmax() + last = len(neg_mask) - neg_mask[::-1].argmax() - 1 + if limit_area == "inside": + mask[:first] = False + mask[last + 1 :] = False + elif limit_area == "outside": + mask[first + 1 : last] = False + + +def _fill_limit_area_2d( + mask: npt.NDArray[np.bool_], limit_area: Literal["outside", "inside"] +) -> None: + """Prepare 2d mask for ffill/bfill with limit_area. + + When called, mask will no longer faithfully represent when + the corresponding are NA or not. + + Parameters + ---------- + mask : np.ndarray[bool, ndim=1] + Mask representing NA values when filling. + limit_area : { "outside", "inside" } + Whether to limit filling to outside or inside the outer most non-NA value. + """ + neg_mask = ~mask.T + if limit_area == "outside": + # Identify inside + la_mask = ( + np.maximum.accumulate(neg_mask, axis=0) + & np.maximum.accumulate(neg_mask[::-1], axis=0)[::-1] + ) + else: + # Identify outside + la_mask = ( + ~np.maximum.accumulate(neg_mask, axis=0) + | ~np.maximum.accumulate(neg_mask[::-1], axis=0)[::-1] + ) + mask[la_mask.T] = False + + +_fill_methods = {"pad": _pad_1d, "backfill": _backfill_1d} + + +def get_fill_func(method, ndim: int = 1): + method = clean_fill_method(method) + if ndim == 1: + return _fill_methods[method] + return {"pad": _pad_2d, "backfill": _backfill_2d}[method] + + +def clean_reindex_fill_method(method) -> ReindexMethod | None: + if method is None: + return None + return clean_fill_method(method, allow_nearest=True) + + +def _interp_limit( + invalid: npt.NDArray[np.bool_], fw_limit: int | None, bw_limit: int | None +): + """ + Get indexers of values that won't be filled + because they exceed the limits. + + Parameters + ---------- + invalid : np.ndarray[bool] + fw_limit : int or None + forward limit to index + bw_limit : int or None + backward limit to index + + Returns + ------- + set of indexers + + Notes + ----- + This is equivalent to the more readable, but slower + + .. code-block:: python + + def _interp_limit(invalid, fw_limit, bw_limit): + for x in np.where(invalid)[0]: + if invalid[max(0, x - fw_limit):x + bw_limit + 1].all(): + yield x + """ + # handle forward first; the backward direction is the same except + # 1. operate on the reversed array + # 2. subtract the returned indices from N - 1 + N = len(invalid) + f_idx = set() + b_idx = set() + + def inner(invalid, limit: int): + limit = min(limit, N) + windowed = _rolling_window(invalid, limit + 1).all(1) + idx = set(np.where(windowed)[0] + limit) | set( + np.where((~invalid[: limit + 1]).cumsum() == 0)[0] + ) + return idx + + if fw_limit is not None: + if fw_limit == 0: + f_idx = set(np.where(invalid)[0]) + else: + f_idx = inner(invalid, fw_limit) + + if bw_limit is not None: + if bw_limit == 0: + # then we don't even need to care about backwards + # just use forwards + return f_idx + else: + b_idx_inv = list(inner(invalid[::-1], bw_limit)) + b_idx = set(N - 1 - np.asarray(b_idx_inv)) + if fw_limit == 0: + return b_idx + + return f_idx & b_idx + + +def _rolling_window(a: npt.NDArray[np.bool_], window: int) -> npt.NDArray[np.bool_]: + """ + [True, True, False, True, False], 2 -> + + [ + [True, True], + [True, False], + [False, True], + [True, False], + ] + """ + # https://stackoverflow.com/a/6811241 + shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) + strides = a.strides + (a.strides[-1],) + return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/nanops.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/nanops.py new file mode 100644 index 0000000000000000000000000000000000000000..6cb825e9b79a25742d6f9f622b7226dbd68d88fe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/nanops.py @@ -0,0 +1,1748 @@ +from __future__ import annotations + +import functools +import itertools +from typing import ( + Any, + Callable, + cast, +) +import warnings + +import numpy as np + +from pandas._config import get_option + +from pandas._libs import ( + NaT, + NaTType, + iNaT, + lib, +) +from pandas._typing import ( + ArrayLike, + AxisInt, + CorrelationMethod, + Dtype, + DtypeObj, + F, + Scalar, + Shape, + npt, +) +from pandas.compat._optional import import_optional_dependency +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import ( + is_complex, + is_float, + is_float_dtype, + is_integer, + is_numeric_dtype, + is_object_dtype, + needs_i8_conversion, + pandas_dtype, +) +from pandas.core.dtypes.missing import ( + isna, + na_value_for_dtype, + notna, +) + +bn = import_optional_dependency("bottleneck", errors="warn") +_BOTTLENECK_INSTALLED = bn is not None +_USE_BOTTLENECK = False + + +def set_use_bottleneck(v: bool = True) -> None: + # set/unset to use bottleneck + global _USE_BOTTLENECK + if _BOTTLENECK_INSTALLED: + _USE_BOTTLENECK = v + + +set_use_bottleneck(get_option("compute.use_bottleneck")) + + +class disallow: + def __init__(self, *dtypes: Dtype) -> None: + super().__init__() + self.dtypes = tuple(pandas_dtype(dtype).type for dtype in dtypes) + + def check(self, obj) -> bool: + return hasattr(obj, "dtype") and issubclass(obj.dtype.type, self.dtypes) + + def __call__(self, f: F) -> F: + @functools.wraps(f) + def _f(*args, **kwargs): + obj_iter = itertools.chain(args, kwargs.values()) + if any(self.check(obj) for obj in obj_iter): + f_name = f.__name__.replace("nan", "") + raise TypeError( + f"reduction operation '{f_name}' not allowed for this dtype" + ) + try: + return f(*args, **kwargs) + except ValueError as e: + # we want to transform an object array + # ValueError message to the more typical TypeError + # e.g. this is normally a disallowed function on + # object arrays that contain strings + if is_object_dtype(args[0]): + raise TypeError(e) from e + raise + + return cast(F, _f) + + +class bottleneck_switch: + def __init__(self, name=None, **kwargs) -> None: + self.name = name + self.kwargs = kwargs + + def __call__(self, alt: F) -> F: + bn_name = self.name or alt.__name__ + + try: + bn_func = getattr(bn, bn_name) + except (AttributeError, NameError): # pragma: no cover + bn_func = None + + @functools.wraps(alt) + def f( + values: np.ndarray, + *, + axis: AxisInt | None = None, + skipna: bool = True, + **kwds, + ): + if len(self.kwargs) > 0: + for k, v in self.kwargs.items(): + if k not in kwds: + kwds[k] = v + + if values.size == 0 and kwds.get("min_count") is None: + # We are empty, returning NA for our type + # Only applies for the default `min_count` of None + # since that affects how empty arrays are handled. + # TODO(GH-18976) update all the nanops methods to + # correctly handle empty inputs and remove this check. + # It *may* just be `var` + return _na_for_min_count(values, axis) + + if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype, bn_name): + if kwds.get("mask", None) is None: + # `mask` is not recognised by bottleneck, would raise + # TypeError if called + kwds.pop("mask", None) + result = bn_func(values, axis=axis, **kwds) + + # prefer to treat inf/-inf as NA, but must compute the func + # twice :( + if _has_infs(result): + result = alt(values, axis=axis, skipna=skipna, **kwds) + else: + result = alt(values, axis=axis, skipna=skipna, **kwds) + else: + result = alt(values, axis=axis, skipna=skipna, **kwds) + + return result + + return cast(F, f) + + +def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool: + # Bottleneck chokes on datetime64, PeriodDtype (or and EA) + if dtype != object and not needs_i8_conversion(dtype): + # GH 42878 + # Bottleneck uses naive summation leading to O(n) loss of precision + # unlike numpy which implements pairwise summation, which has O(log(n)) loss + # crossref: https://github.com/pydata/bottleneck/issues/379 + + # GH 15507 + # bottleneck does not properly upcast during the sum + # so can overflow + + # GH 9422 + # further we also want to preserve NaN when all elements + # are NaN, unlike bottleneck/numpy which consider this + # to be 0 + return name not in ["nansum", "nanprod", "nanmean"] + return False + + +def _has_infs(result) -> bool: + if isinstance(result, np.ndarray): + if result.dtype in ("f8", "f4"): + # Note: outside of an nanops-specific test, we always have + # result.ndim == 1, so there is no risk of this ravel making a copy. + return lib.has_infs(result.ravel("K")) + try: + return np.isinf(result).any() + except (TypeError, NotImplementedError): + # if it doesn't support infs, then it can't have infs + return False + + +def _get_fill_value( + dtype: DtypeObj, fill_value: Scalar | None = None, fill_value_typ=None +): + """return the correct fill value for the dtype of the values""" + if fill_value is not None: + return fill_value + if _na_ok_dtype(dtype): + if fill_value_typ is None: + return np.nan + else: + if fill_value_typ == "+inf": + return np.inf + else: + return -np.inf + else: + if fill_value_typ == "+inf": + # need the max int here + return lib.i8max + else: + return iNaT + + +def _maybe_get_mask( + values: np.ndarray, skipna: bool, mask: npt.NDArray[np.bool_] | None +) -> npt.NDArray[np.bool_] | None: + """ + Compute a mask if and only if necessary. + + This function will compute a mask iff it is necessary. Otherwise, + return the provided mask (potentially None) when a mask does not need to be + computed. + + A mask is never necessary if the values array is of boolean or integer + dtypes, as these are incapable of storing NaNs. If passing a NaN-capable + dtype that is interpretable as either boolean or integer data (eg, + timedelta64), a mask must be provided. + + If the skipna parameter is False, a new mask will not be computed. + + The mask is computed using isna() by default. Setting invert=True selects + notna() as the masking function. + + Parameters + ---------- + values : ndarray + input array to potentially compute mask for + skipna : bool + boolean for whether NaNs should be skipped + mask : Optional[ndarray] + nan-mask if known + + Returns + ------- + Optional[np.ndarray[bool]] + """ + if mask is None: + if values.dtype.kind in "biu": + # Boolean data cannot contain nulls, so signal via mask being None + return None + + if skipna or values.dtype.kind in "mM": + mask = isna(values) + + return mask + + +def _get_values( + values: np.ndarray, + skipna: bool, + fill_value: Any = None, + fill_value_typ: str | None = None, + mask: npt.NDArray[np.bool_] | None = None, +) -> tuple[np.ndarray, npt.NDArray[np.bool_] | None]: + """ + Utility to get the values view, mask, dtype, dtype_max, and fill_value. + + If both mask and fill_value/fill_value_typ are not None and skipna is True, + the values array will be copied. + + For input arrays of boolean or integer dtypes, copies will only occur if a + precomputed mask, a fill_value/fill_value_typ, and skipna=True are + provided. + + Parameters + ---------- + values : ndarray + input array to potentially compute mask for + skipna : bool + boolean for whether NaNs should be skipped + fill_value : Any + value to fill NaNs with + fill_value_typ : str + Set to '+inf' or '-inf' to handle dtype-specific infinities + mask : Optional[np.ndarray[bool]] + nan-mask if known + + Returns + ------- + values : ndarray + Potential copy of input value array + mask : Optional[ndarray[bool]] + Mask for values, if deemed necessary to compute + """ + # In _get_values is only called from within nanops, and in all cases + # with scalar fill_value. This guarantee is important for the + # np.where call below + + mask = _maybe_get_mask(values, skipna, mask) + + dtype = values.dtype + + datetimelike = False + if values.dtype.kind in "mM": + # changing timedelta64/datetime64 to int64 needs to happen after + # finding `mask` above + values = np.asarray(values.view("i8")) + datetimelike = True + + if skipna and (mask is not None): + # get our fill value (in case we need to provide an alternative + # dtype for it) + fill_value = _get_fill_value( + dtype, fill_value=fill_value, fill_value_typ=fill_value_typ + ) + + if fill_value is not None: + if mask.any(): + if datetimelike or _na_ok_dtype(dtype): + values = values.copy() + np.putmask(values, mask, fill_value) + else: + # np.where will promote if needed + values = np.where(~mask, values, fill_value) + + return values, mask + + +def _get_dtype_max(dtype: np.dtype) -> np.dtype: + # return a platform independent precision dtype + dtype_max = dtype + if dtype.kind in "bi": + dtype_max = np.dtype(np.int64) + elif dtype.kind == "u": + dtype_max = np.dtype(np.uint64) + elif dtype.kind == "f": + dtype_max = np.dtype(np.float64) + return dtype_max + + +def _na_ok_dtype(dtype: DtypeObj) -> bool: + if needs_i8_conversion(dtype): + return False + return not issubclass(dtype.type, np.integer) + + +def _wrap_results(result, dtype: np.dtype, fill_value=None): + """wrap our results if needed""" + if result is NaT: + pass + + elif dtype.kind == "M": + if fill_value is None: + # GH#24293 + fill_value = iNaT + if not isinstance(result, np.ndarray): + assert not isna(fill_value), "Expected non-null fill_value" + if result == fill_value: + result = np.nan + + if isna(result): + result = np.datetime64("NaT", "ns").astype(dtype) + else: + result = np.int64(result).view(dtype) + # retain original unit + result = result.astype(dtype, copy=False) + else: + # If we have float dtype, taking a view will give the wrong result + result = result.astype(dtype) + elif dtype.kind == "m": + if not isinstance(result, np.ndarray): + if result == fill_value or np.isnan(result): + result = np.timedelta64("NaT").astype(dtype) + + elif np.fabs(result) > lib.i8max: + # raise if we have a timedelta64[ns] which is too large + raise ValueError("overflow in timedelta operation") + else: + # return a timedelta64 with the original unit + result = np.int64(result).astype(dtype, copy=False) + + else: + result = result.astype("m8[ns]").view(dtype) + + return result + + +def _datetimelike_compat(func: F) -> F: + """ + If we have datetime64 or timedelta64 values, ensure we have a correct + mask before calling the wrapped function, then cast back afterwards. + """ + + @functools.wraps(func) + def new_func( + values: np.ndarray, + *, + axis: AxisInt | None = None, + skipna: bool = True, + mask: npt.NDArray[np.bool_] | None = None, + **kwargs, + ): + orig_values = values + + datetimelike = values.dtype.kind in "mM" + if datetimelike and mask is None: + mask = isna(values) + + result = func(values, axis=axis, skipna=skipna, mask=mask, **kwargs) + + if datetimelike: + result = _wrap_results(result, orig_values.dtype, fill_value=iNaT) + if not skipna: + assert mask is not None # checked above + result = _mask_datetimelike_result(result, axis, mask, orig_values) + + return result + + return cast(F, new_func) + + +def _na_for_min_count(values: np.ndarray, axis: AxisInt | None) -> Scalar | np.ndarray: + """ + Return the missing value for `values`. + + Parameters + ---------- + values : ndarray + axis : int or None + axis for the reduction, required if values.ndim > 1. + + Returns + ------- + result : scalar or ndarray + For 1-D values, returns a scalar of the correct missing type. + For 2-D values, returns a 1-D array where each element is missing. + """ + # we either return np.nan or pd.NaT + if values.dtype.kind in "iufcb": + values = values.astype("float64") + fill_value = na_value_for_dtype(values.dtype) + + if values.ndim == 1: + return fill_value + elif axis is None: + return fill_value + else: + result_shape = values.shape[:axis] + values.shape[axis + 1 :] + + return np.full(result_shape, fill_value, dtype=values.dtype) + + +def maybe_operate_rowwise(func: F) -> F: + """ + NumPy operations on C-contiguous ndarrays with axis=1 can be + very slow if axis 1 >> axis 0. + Operate row-by-row and concatenate the results. + """ + + @functools.wraps(func) + def newfunc(values: np.ndarray, *, axis: AxisInt | None = None, **kwargs): + if ( + axis == 1 + and values.ndim == 2 + and values.flags["C_CONTIGUOUS"] + # only takes this path for wide arrays (long dataframes), for threshold see + # https://github.com/pandas-dev/pandas/pull/43311#issuecomment-974891737 + and (values.shape[1] / 1000) > values.shape[0] + and values.dtype != object + and values.dtype != bool + ): + arrs = list(values) + if kwargs.get("mask") is not None: + mask = kwargs.pop("mask") + results = [ + func(arrs[i], mask=mask[i], **kwargs) for i in range(len(arrs)) + ] + else: + results = [func(x, **kwargs) for x in arrs] + return np.array(results) + + return func(values, axis=axis, **kwargs) + + return cast(F, newfunc) + + +def nanany( + values: np.ndarray, + *, + axis: AxisInt | None = None, + skipna: bool = True, + mask: npt.NDArray[np.bool_] | None = None, +) -> bool: + """ + Check if any elements along an axis evaluate to True. + + Parameters + ---------- + values : ndarray + axis : int, optional + skipna : bool, default True + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : bool + + Examples + -------- + >>> from pandas.core import nanops + >>> s = pd.Series([1, 2]) + >>> nanops.nanany(s.values) + True + + >>> from pandas.core import nanops + >>> s = pd.Series([np.nan]) + >>> nanops.nanany(s.values) + False + """ + if values.dtype.kind in "iub" and mask is None: + # GH#26032 fastpath + # error: Incompatible return value type (got "Union[bool_, ndarray]", + # expected "bool") + return values.any(axis) # type: ignore[return-value] + + if values.dtype.kind == "M": + # GH#34479 + warnings.warn( + "'any' with datetime64 dtypes is deprecated and will raise in a " + "future version. Use (obj != pd.Timestamp(0)).any() instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + values, _ = _get_values(values, skipna, fill_value=False, mask=mask) + + # For object type, any won't necessarily return + # boolean values (numpy/numpy#4352) + if values.dtype == object: + values = values.astype(bool) + + # error: Incompatible return value type (got "Union[bool_, ndarray]", expected + # "bool") + return values.any(axis) # type: ignore[return-value] + + +def nanall( + values: np.ndarray, + *, + axis: AxisInt | None = None, + skipna: bool = True, + mask: npt.NDArray[np.bool_] | None = None, +) -> bool: + """ + Check if all elements along an axis evaluate to True. + + Parameters + ---------- + values : ndarray + axis : int, optional + skipna : bool, default True + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : bool + + Examples + -------- + >>> from pandas.core import nanops + >>> s = pd.Series([1, 2, np.nan]) + >>> nanops.nanall(s.values) + True + + >>> from pandas.core import nanops + >>> s = pd.Series([1, 0]) + >>> nanops.nanall(s.values) + False + """ + if values.dtype.kind in "iub" and mask is None: + # GH#26032 fastpath + # error: Incompatible return value type (got "Union[bool_, ndarray]", + # expected "bool") + return values.all(axis) # type: ignore[return-value] + + if values.dtype.kind == "M": + # GH#34479 + warnings.warn( + "'all' with datetime64 dtypes is deprecated and will raise in a " + "future version. Use (obj != pd.Timestamp(0)).all() instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + values, _ = _get_values(values, skipna, fill_value=True, mask=mask) + + # For object type, all won't necessarily return + # boolean values (numpy/numpy#4352) + if values.dtype == object: + values = values.astype(bool) + + # error: Incompatible return value type (got "Union[bool_, ndarray]", expected + # "bool") + return values.all(axis) # type: ignore[return-value] + + +@disallow("M8") +@_datetimelike_compat +@maybe_operate_rowwise +def nansum( + values: np.ndarray, + *, + axis: AxisInt | None = None, + skipna: bool = True, + min_count: int = 0, + mask: npt.NDArray[np.bool_] | None = None, +) -> float: + """ + Sum the elements along an axis ignoring NaNs + + Parameters + ---------- + values : ndarray[dtype] + axis : int, optional + skipna : bool, default True + min_count: int, default 0 + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : dtype + + Examples + -------- + >>> from pandas.core import nanops + >>> s = pd.Series([1, 2, np.nan]) + >>> nanops.nansum(s.values) + 3.0 + """ + dtype = values.dtype + values, mask = _get_values(values, skipna, fill_value=0, mask=mask) + dtype_sum = _get_dtype_max(dtype) + if dtype.kind == "f": + dtype_sum = dtype + elif dtype.kind == "m": + dtype_sum = np.dtype(np.float64) + + the_sum = values.sum(axis, dtype=dtype_sum) + the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count) + + return the_sum + + +def _mask_datetimelike_result( + result: np.ndarray | np.datetime64 | np.timedelta64, + axis: AxisInt | None, + mask: npt.NDArray[np.bool_], + orig_values: np.ndarray, +) -> np.ndarray | np.datetime64 | np.timedelta64 | NaTType: + if isinstance(result, np.ndarray): + # we need to apply the mask + result = result.astype("i8").view(orig_values.dtype) + axis_mask = mask.any(axis=axis) + # error: Unsupported target for indexed assignment ("Union[ndarray[Any, Any], + # datetime64, timedelta64]") + result[axis_mask] = iNaT # type: ignore[index] + else: + if mask.any(): + return np.int64(iNaT).view(orig_values.dtype) + return result + + +@bottleneck_switch() +@_datetimelike_compat +def nanmean( + values: np.ndarray, + *, + axis: AxisInt | None = None, + skipna: bool = True, + mask: npt.NDArray[np.bool_] | None = None, +) -> float: + """ + Compute the mean of the element along an axis ignoring NaNs + + Parameters + ---------- + values : ndarray + axis : int, optional + skipna : bool, default True + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + float + Unless input is a float array, in which case use the same + precision as the input array. + + Examples + -------- + >>> from pandas.core import nanops + >>> s = pd.Series([1, 2, np.nan]) + >>> nanops.nanmean(s.values) + 1.5 + """ + dtype = values.dtype + values, mask = _get_values(values, skipna, fill_value=0, mask=mask) + dtype_sum = _get_dtype_max(dtype) + dtype_count = np.dtype(np.float64) + + # not using needs_i8_conversion because that includes period + if dtype.kind in "mM": + dtype_sum = np.dtype(np.float64) + elif dtype.kind in "iu": + dtype_sum = np.dtype(np.float64) + elif dtype.kind == "f": + dtype_sum = dtype + dtype_count = dtype + + count = _get_counts(values.shape, mask, axis, dtype=dtype_count) + the_sum = values.sum(axis, dtype=dtype_sum) + the_sum = _ensure_numeric(the_sum) + + if axis is not None and getattr(the_sum, "ndim", False): + count = cast(np.ndarray, count) + with np.errstate(all="ignore"): + # suppress division by zero warnings + the_mean = the_sum / count + ct_mask = count == 0 + if ct_mask.any(): + the_mean[ct_mask] = np.nan + else: + the_mean = the_sum / count if count > 0 else np.nan + + return the_mean + + +@bottleneck_switch() +def nanmedian(values, *, axis: AxisInt | None = None, skipna: bool = True, mask=None): + """ + Parameters + ---------- + values : ndarray + axis : int, optional + skipna : bool, default True + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : float + Unless input is a float array, in which case use the same + precision as the input array. + + Examples + -------- + >>> from pandas.core import nanops + >>> s = pd.Series([1, np.nan, 2, 2]) + >>> nanops.nanmedian(s.values) + 2.0 + """ + # for floats without mask, the data already uses NaN as missing value + # indicator, and `mask` will be calculated from that below -> in those + # cases we never need to set NaN to the masked values + using_nan_sentinel = values.dtype.kind == "f" and mask is None + + def get_median(x, _mask=None): + if _mask is None: + _mask = notna(x) + else: + _mask = ~_mask + if not skipna and not _mask.all(): + return np.nan + with warnings.catch_warnings(): + # Suppress RuntimeWarning about All-NaN slice + warnings.filterwarnings( + "ignore", "All-NaN slice encountered", RuntimeWarning + ) + res = np.nanmedian(x[_mask]) + return res + + dtype = values.dtype + values, mask = _get_values(values, skipna, mask=mask, fill_value=None) + if values.dtype.kind != "f": + if values.dtype == object: + # GH#34671 avoid casting strings to numeric + inferred = lib.infer_dtype(values) + if inferred in ["string", "mixed"]: + raise TypeError(f"Cannot convert {values} to numeric") + try: + values = values.astype("f8") + except ValueError as err: + # e.g. "could not convert string to float: 'a'" + raise TypeError(str(err)) from err + if not using_nan_sentinel and mask is not None: + if not values.flags.writeable: + values = values.copy() + values[mask] = np.nan + + notempty = values.size + + # an array from a frame + if values.ndim > 1 and axis is not None: + # there's a non-empty array to apply over otherwise numpy raises + if notempty: + if not skipna: + res = np.apply_along_axis(get_median, axis, values) + + else: + # fastpath for the skipna case + with warnings.catch_warnings(): + # Suppress RuntimeWarning about All-NaN slice + warnings.filterwarnings( + "ignore", "All-NaN slice encountered", RuntimeWarning + ) + if (values.shape[1] == 1 and axis == 0) or ( + values.shape[0] == 1 and axis == 1 + ): + # GH52788: fastpath when squeezable, nanmedian for 2D array slow + res = np.nanmedian(np.squeeze(values), keepdims=True) + else: + res = np.nanmedian(values, axis=axis) + + else: + # must return the correct shape, but median is not defined for the + # empty set so return nans of shape "everything but the passed axis" + # since "axis" is where the reduction would occur if we had a nonempty + # array + res = _get_empty_reduction_result(values.shape, axis) + + else: + # otherwise return a scalar value + res = get_median(values, mask) if notempty else np.nan + return _wrap_results(res, dtype) + + +def _get_empty_reduction_result( + shape: Shape, + axis: AxisInt, +) -> np.ndarray: + """ + The result from a reduction on an empty ndarray. + + Parameters + ---------- + shape : Tuple[int, ...] + axis : int + + Returns + ------- + np.ndarray + """ + shp = np.array(shape) + dims = np.arange(len(shape)) + ret = np.empty(shp[dims != axis], dtype=np.float64) + ret.fill(np.nan) + return ret + + +def _get_counts_nanvar( + values_shape: Shape, + mask: npt.NDArray[np.bool_] | None, + axis: AxisInt | None, + ddof: int, + dtype: np.dtype = np.dtype(np.float64), +) -> tuple[float | np.ndarray, float | np.ndarray]: + """ + Get the count of non-null values along an axis, accounting + for degrees of freedom. + + Parameters + ---------- + values_shape : Tuple[int, ...] + shape tuple from values ndarray, used if mask is None + mask : Optional[ndarray[bool]] + locations in values that should be considered missing + axis : Optional[int] + axis to count along + ddof : int + degrees of freedom + dtype : type, optional + type to use for count + + Returns + ------- + count : int, np.nan or np.ndarray + d : int, np.nan or np.ndarray + """ + count = _get_counts(values_shape, mask, axis, dtype=dtype) + d = count - dtype.type(ddof) + + # always return NaN, never inf + if is_float(count): + if count <= ddof: + # error: Incompatible types in assignment (expression has type + # "float", variable has type "Union[floating[Any], ndarray[Any, + # dtype[floating[Any]]]]") + count = np.nan # type: ignore[assignment] + d = np.nan + else: + # count is not narrowed by is_float check + count = cast(np.ndarray, count) + mask = count <= ddof + if mask.any(): + np.putmask(d, mask, np.nan) + np.putmask(count, mask, np.nan) + return count, d + + +@bottleneck_switch(ddof=1) +def nanstd( + values, + *, + axis: AxisInt | None = None, + skipna: bool = True, + ddof: int = 1, + mask=None, +): + """ + Compute the standard deviation along given axis while ignoring NaNs + + Parameters + ---------- + values : ndarray + axis : int, optional + skipna : bool, default True + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : float + Unless input is a float array, in which case use the same + precision as the input array. + + Examples + -------- + >>> from pandas.core import nanops + >>> s = pd.Series([1, np.nan, 2, 3]) + >>> nanops.nanstd(s.values) + 1.0 + """ + if values.dtype == "M8[ns]": + values = values.view("m8[ns]") + + orig_dtype = values.dtype + values, mask = _get_values(values, skipna, mask=mask) + + result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask)) + return _wrap_results(result, orig_dtype) + + +@disallow("M8", "m8") +@bottleneck_switch(ddof=1) +def nanvar( + values: np.ndarray, + *, + axis: AxisInt | None = None, + skipna: bool = True, + ddof: int = 1, + mask=None, +): + """ + Compute the variance along given axis while ignoring NaNs + + Parameters + ---------- + values : ndarray + axis : int, optional + skipna : bool, default True + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : float + Unless input is a float array, in which case use the same + precision as the input array. + + Examples + -------- + >>> from pandas.core import nanops + >>> s = pd.Series([1, np.nan, 2, 3]) + >>> nanops.nanvar(s.values) + 1.0 + """ + dtype = values.dtype + mask = _maybe_get_mask(values, skipna, mask) + if dtype.kind in "iu": + values = values.astype("f8") + if mask is not None: + values[mask] = np.nan + + if values.dtype.kind == "f": + count, d = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype) + else: + count, d = _get_counts_nanvar(values.shape, mask, axis, ddof) + + if skipna and mask is not None: + values = values.copy() + np.putmask(values, mask, 0) + + # xref GH10242 + # Compute variance via two-pass algorithm, which is stable against + # cancellation errors and relatively accurate for small numbers of + # observations. + # + # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count + if axis is not None: + avg = np.expand_dims(avg, axis) + sqr = _ensure_numeric((avg - values) ** 2) + if mask is not None: + np.putmask(sqr, mask, 0) + result = sqr.sum(axis=axis, dtype=np.float64) / d + + # Return variance as np.float64 (the datatype used in the accumulator), + # unless we were dealing with a float array, in which case use the same + # precision as the original values array. + if dtype.kind == "f": + result = result.astype(dtype, copy=False) + return result + + +@disallow("M8", "m8") +def nansem( + values: np.ndarray, + *, + axis: AxisInt | None = None, + skipna: bool = True, + ddof: int = 1, + mask: npt.NDArray[np.bool_] | None = None, +) -> float: + """ + Compute the standard error in the mean along given axis while ignoring NaNs + + Parameters + ---------- + values : ndarray + axis : int, optional + skipna : bool, default True + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : float64 + Unless input is a float array, in which case use the same + precision as the input array. + + Examples + -------- + >>> from pandas.core import nanops + >>> s = pd.Series([1, np.nan, 2, 3]) + >>> nanops.nansem(s.values) + 0.5773502691896258 + """ + # This checks if non-numeric-like data is passed with numeric_only=False + # and raises a TypeError otherwise + nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask) + + mask = _maybe_get_mask(values, skipna, mask) + if values.dtype.kind != "f": + values = values.astype("f8") + + if not skipna and mask is not None and mask.any(): + return np.nan + + count, _ = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype) + var = nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask) + + return np.sqrt(var) / np.sqrt(count) + + +def _nanminmax(meth, fill_value_typ): + @bottleneck_switch(name=f"nan{meth}") + @_datetimelike_compat + def reduction( + values: np.ndarray, + *, + axis: AxisInt | None = None, + skipna: bool = True, + mask: npt.NDArray[np.bool_] | None = None, + ): + if values.size == 0: + return _na_for_min_count(values, axis) + + values, mask = _get_values( + values, skipna, fill_value_typ=fill_value_typ, mask=mask + ) + result = getattr(values, meth)(axis) + result = _maybe_null_out(result, axis, mask, values.shape) + return result + + return reduction + + +nanmin = _nanminmax("min", fill_value_typ="+inf") +nanmax = _nanminmax("max", fill_value_typ="-inf") + + +def nanargmax( + values: np.ndarray, + *, + axis: AxisInt | None = None, + skipna: bool = True, + mask: npt.NDArray[np.bool_] | None = None, +) -> int | np.ndarray: + """ + Parameters + ---------- + values : ndarray + axis : int, optional + skipna : bool, default True + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : int or ndarray[int] + The index/indices of max value in specified axis or -1 in the NA case + + Examples + -------- + >>> from pandas.core import nanops + >>> arr = np.array([1, 2, 3, np.nan, 4]) + >>> nanops.nanargmax(arr) + 4 + + >>> arr = np.array(range(12), dtype=np.float64).reshape(4, 3) + >>> arr[2:, 2] = np.nan + >>> arr + array([[ 0., 1., 2.], + [ 3., 4., 5.], + [ 6., 7., nan], + [ 9., 10., nan]]) + >>> nanops.nanargmax(arr, axis=1) + array([2, 2, 1, 1]) + """ + values, mask = _get_values(values, True, fill_value_typ="-inf", mask=mask) + result = values.argmax(axis) + # error: Argument 1 to "_maybe_arg_null_out" has incompatible type "Any | + # signedinteger[Any]"; expected "ndarray[Any, Any]" + result = _maybe_arg_null_out(result, axis, mask, skipna) # type: ignore[arg-type] + return result + + +def nanargmin( + values: np.ndarray, + *, + axis: AxisInt | None = None, + skipna: bool = True, + mask: npt.NDArray[np.bool_] | None = None, +) -> int | np.ndarray: + """ + Parameters + ---------- + values : ndarray + axis : int, optional + skipna : bool, default True + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : int or ndarray[int] + The index/indices of min value in specified axis or -1 in the NA case + + Examples + -------- + >>> from pandas.core import nanops + >>> arr = np.array([1, 2, 3, np.nan, 4]) + >>> nanops.nanargmin(arr) + 0 + + >>> arr = np.array(range(12), dtype=np.float64).reshape(4, 3) + >>> arr[2:, 0] = np.nan + >>> arr + array([[ 0., 1., 2.], + [ 3., 4., 5.], + [nan, 7., 8.], + [nan, 10., 11.]]) + >>> nanops.nanargmin(arr, axis=1) + array([0, 0, 1, 1]) + """ + values, mask = _get_values(values, True, fill_value_typ="+inf", mask=mask) + result = values.argmin(axis) + # error: Argument 1 to "_maybe_arg_null_out" has incompatible type "Any | + # signedinteger[Any]"; expected "ndarray[Any, Any]" + result = _maybe_arg_null_out(result, axis, mask, skipna) # type: ignore[arg-type] + return result + + +@disallow("M8", "m8") +@maybe_operate_rowwise +def nanskew( + values: np.ndarray, + *, + axis: AxisInt | None = None, + skipna: bool = True, + mask: npt.NDArray[np.bool_] | None = None, +) -> float: + """ + Compute the sample skewness. + + The statistic computed here is the adjusted Fisher-Pearson standardized + moment coefficient G1. The algorithm computes this coefficient directly + from the second and third central moment. + + Parameters + ---------- + values : ndarray + axis : int, optional + skipna : bool, default True + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : float64 + Unless input is a float array, in which case use the same + precision as the input array. + + Examples + -------- + >>> from pandas.core import nanops + >>> s = pd.Series([1, np.nan, 1, 2]) + >>> nanops.nanskew(s.values) + 1.7320508075688787 + """ + mask = _maybe_get_mask(values, skipna, mask) + if values.dtype.kind != "f": + values = values.astype("f8") + count = _get_counts(values.shape, mask, axis) + else: + count = _get_counts(values.shape, mask, axis, dtype=values.dtype) + + if skipna and mask is not None: + values = values.copy() + np.putmask(values, mask, 0) + elif not skipna and mask is not None and mask.any(): + return np.nan + + with np.errstate(invalid="ignore", divide="ignore"): + mean = values.sum(axis, dtype=np.float64) / count + if axis is not None: + mean = np.expand_dims(mean, axis) + + adjusted = values - mean + if skipna and mask is not None: + np.putmask(adjusted, mask, 0) + adjusted2 = adjusted**2 + adjusted3 = adjusted2 * adjusted + m2 = adjusted2.sum(axis, dtype=np.float64) + m3 = adjusted3.sum(axis, dtype=np.float64) + + # floating point error + # + # #18044 in _libs/windows.pyx calc_skew follow this behavior + # to fix the fperr to treat m2 <1e-14 as zero + m2 = _zero_out_fperr(m2) + m3 = _zero_out_fperr(m3) + + with np.errstate(invalid="ignore", divide="ignore"): + result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2**1.5) + + dtype = values.dtype + if dtype.kind == "f": + result = result.astype(dtype, copy=False) + + if isinstance(result, np.ndarray): + result = np.where(m2 == 0, 0, result) + result[count < 3] = np.nan + else: + result = dtype.type(0) if m2 == 0 else result + if count < 3: + return np.nan + + return result + + +@disallow("M8", "m8") +@maybe_operate_rowwise +def nankurt( + values: np.ndarray, + *, + axis: AxisInt | None = None, + skipna: bool = True, + mask: npt.NDArray[np.bool_] | None = None, +) -> float: + """ + Compute the sample excess kurtosis + + The statistic computed here is the adjusted Fisher-Pearson standardized + moment coefficient G2, computed directly from the second and fourth + central moment. + + Parameters + ---------- + values : ndarray + axis : int, optional + skipna : bool, default True + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : float64 + Unless input is a float array, in which case use the same + precision as the input array. + + Examples + -------- + >>> from pandas.core import nanops + >>> s = pd.Series([1, np.nan, 1, 3, 2]) + >>> nanops.nankurt(s.values) + -1.2892561983471076 + """ + mask = _maybe_get_mask(values, skipna, mask) + if values.dtype.kind != "f": + values = values.astype("f8") + count = _get_counts(values.shape, mask, axis) + else: + count = _get_counts(values.shape, mask, axis, dtype=values.dtype) + + if skipna and mask is not None: + values = values.copy() + np.putmask(values, mask, 0) + elif not skipna and mask is not None and mask.any(): + return np.nan + + with np.errstate(invalid="ignore", divide="ignore"): + mean = values.sum(axis, dtype=np.float64) / count + if axis is not None: + mean = np.expand_dims(mean, axis) + + adjusted = values - mean + if skipna and mask is not None: + np.putmask(adjusted, mask, 0) + adjusted2 = adjusted**2 + adjusted4 = adjusted2**2 + m2 = adjusted2.sum(axis, dtype=np.float64) + m4 = adjusted4.sum(axis, dtype=np.float64) + + with np.errstate(invalid="ignore", divide="ignore"): + adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3)) + numerator = count * (count + 1) * (count - 1) * m4 + denominator = (count - 2) * (count - 3) * m2**2 + + # floating point error + # + # #18044 in _libs/windows.pyx calc_kurt follow this behavior + # to fix the fperr to treat denom <1e-14 as zero + numerator = _zero_out_fperr(numerator) + denominator = _zero_out_fperr(denominator) + + if not isinstance(denominator, np.ndarray): + # if ``denom`` is a scalar, check these corner cases first before + # doing division + if count < 4: + return np.nan + if denominator == 0: + return values.dtype.type(0) + + with np.errstate(invalid="ignore", divide="ignore"): + result = numerator / denominator - adj + + dtype = values.dtype + if dtype.kind == "f": + result = result.astype(dtype, copy=False) + + if isinstance(result, np.ndarray): + result = np.where(denominator == 0, 0, result) + result[count < 4] = np.nan + + return result + + +@disallow("M8", "m8") +@maybe_operate_rowwise +def nanprod( + values: np.ndarray, + *, + axis: AxisInt | None = None, + skipna: bool = True, + min_count: int = 0, + mask: npt.NDArray[np.bool_] | None = None, +) -> float: + """ + Parameters + ---------- + values : ndarray[dtype] + axis : int, optional + skipna : bool, default True + min_count: int, default 0 + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + Dtype + The product of all elements on a given axis. ( NaNs are treated as 1) + + Examples + -------- + >>> from pandas.core import nanops + >>> s = pd.Series([1, 2, 3, np.nan]) + >>> nanops.nanprod(s.values) + 6.0 + """ + mask = _maybe_get_mask(values, skipna, mask) + + if skipna and mask is not None: + values = values.copy() + values[mask] = 1 + result = values.prod(axis) + # error: Incompatible return value type (got "Union[ndarray, float]", expected + # "float") + return _maybe_null_out( # type: ignore[return-value] + result, axis, mask, values.shape, min_count=min_count + ) + + +def _maybe_arg_null_out( + result: np.ndarray, + axis: AxisInt | None, + mask: npt.NDArray[np.bool_] | None, + skipna: bool, +) -> np.ndarray | int: + # helper function for nanargmin/nanargmax + if mask is None: + return result + + if axis is None or not getattr(result, "ndim", False): + if skipna: + if mask.all(): + return -1 + else: + if mask.any(): + return -1 + else: + if skipna: + na_mask = mask.all(axis) + else: + na_mask = mask.any(axis) + if na_mask.any(): + result[na_mask] = -1 + return result + + +def _get_counts( + values_shape: Shape, + mask: npt.NDArray[np.bool_] | None, + axis: AxisInt | None, + dtype: np.dtype[np.floating] = np.dtype(np.float64), +) -> np.floating | npt.NDArray[np.floating]: + """ + Get the count of non-null values along an axis + + Parameters + ---------- + values_shape : tuple of int + shape tuple from values ndarray, used if mask is None + mask : Optional[ndarray[bool]] + locations in values that should be considered missing + axis : Optional[int] + axis to count along + dtype : type, optional + type to use for count + + Returns + ------- + count : scalar or array + """ + if axis is None: + if mask is not None: + n = mask.size - mask.sum() + else: + n = np.prod(values_shape) + return dtype.type(n) + + if mask is not None: + count = mask.shape[axis] - mask.sum(axis) + else: + count = values_shape[axis] + + if is_integer(count): + return dtype.type(count) + return count.astype(dtype, copy=False) + + +def _maybe_null_out( + result: np.ndarray | float | NaTType, + axis: AxisInt | None, + mask: npt.NDArray[np.bool_] | None, + shape: tuple[int, ...], + min_count: int = 1, +) -> np.ndarray | float | NaTType: + """ + Returns + ------- + Dtype + The product of all elements on a given axis. ( NaNs are treated as 1) + """ + if mask is None and min_count == 0: + # nothing to check; short-circuit + return result + + if axis is not None and isinstance(result, np.ndarray): + if mask is not None: + null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0 + else: + # we have no nulls, kept mask=None in _maybe_get_mask + below_count = shape[axis] - min_count < 0 + new_shape = shape[:axis] + shape[axis + 1 :] + null_mask = np.broadcast_to(below_count, new_shape) + + if np.any(null_mask): + if is_numeric_dtype(result): + if np.iscomplexobj(result): + result = result.astype("c16") + elif not is_float_dtype(result): + result = result.astype("f8", copy=False) + result[null_mask] = np.nan + else: + # GH12941, use None to auto cast null + result[null_mask] = None + elif result is not NaT: + if check_below_min_count(shape, mask, min_count): + result_dtype = getattr(result, "dtype", None) + if is_float_dtype(result_dtype): + # error: Item "None" of "Optional[Any]" has no attribute "type" + result = result_dtype.type("nan") # type: ignore[union-attr] + else: + result = np.nan + + return result + + +def check_below_min_count( + shape: tuple[int, ...], mask: npt.NDArray[np.bool_] | None, min_count: int +) -> bool: + """ + Check for the `min_count` keyword. Returns True if below `min_count` (when + missing value should be returned from the reduction). + + Parameters + ---------- + shape : tuple + The shape of the values (`values.shape`). + mask : ndarray[bool] or None + Boolean numpy array (typically of same shape as `shape`) or None. + min_count : int + Keyword passed through from sum/prod call. + + Returns + ------- + bool + """ + if min_count > 0: + if mask is None: + # no missing values, only check size + non_nulls = np.prod(shape) + else: + non_nulls = mask.size - mask.sum() + if non_nulls < min_count: + return True + return False + + +def _zero_out_fperr(arg): + # #18044 reference this behavior to fix rolling skew/kurt issue + if isinstance(arg, np.ndarray): + return np.where(np.abs(arg) < 1e-14, 0, arg) + else: + return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg + + +@disallow("M8", "m8") +def nancorr( + a: np.ndarray, + b: np.ndarray, + *, + method: CorrelationMethod = "pearson", + min_periods: int | None = None, +) -> float: + """ + a, b: ndarrays + """ + if len(a) != len(b): + raise AssertionError("Operands to nancorr must have same size") + + if min_periods is None: + min_periods = 1 + + valid = notna(a) & notna(b) + if not valid.all(): + a = a[valid] + b = b[valid] + + if len(a) < min_periods: + return np.nan + + a = _ensure_numeric(a) + b = _ensure_numeric(b) + + f = get_corr_func(method) + return f(a, b) + + +def get_corr_func( + method: CorrelationMethod, +) -> Callable[[np.ndarray, np.ndarray], float]: + if method == "kendall": + from scipy.stats import kendalltau + + def func(a, b): + return kendalltau(a, b)[0] + + return func + elif method == "spearman": + from scipy.stats import spearmanr + + def func(a, b): + return spearmanr(a, b)[0] + + return func + elif method == "pearson": + + def func(a, b): + return np.corrcoef(a, b)[0, 1] + + return func + elif callable(method): + return method + + raise ValueError( + f"Unknown method '{method}', expected one of " + "'kendall', 'spearman', 'pearson', or callable" + ) + + +@disallow("M8", "m8") +def nancov( + a: np.ndarray, + b: np.ndarray, + *, + min_periods: int | None = None, + ddof: int | None = 1, +) -> float: + if len(a) != len(b): + raise AssertionError("Operands to nancov must have same size") + + if min_periods is None: + min_periods = 1 + + valid = notna(a) & notna(b) + if not valid.all(): + a = a[valid] + b = b[valid] + + if len(a) < min_periods: + return np.nan + + a = _ensure_numeric(a) + b = _ensure_numeric(b) + + return np.cov(a, b, ddof=ddof)[0, 1] + + +def _ensure_numeric(x): + if isinstance(x, np.ndarray): + if x.dtype.kind in "biu": + x = x.astype(np.float64) + elif x.dtype == object: + inferred = lib.infer_dtype(x) + if inferred in ["string", "mixed"]: + # GH#44008, GH#36703 avoid casting e.g. strings to numeric + raise TypeError(f"Could not convert {x} to numeric") + try: + x = x.astype(np.complex128) + except (TypeError, ValueError): + try: + x = x.astype(np.float64) + except ValueError as err: + # GH#29941 we get here with object arrays containing strs + raise TypeError(f"Could not convert {x} to numeric") from err + else: + if not np.any(np.imag(x)): + x = x.real + elif not (is_float(x) or is_integer(x) or is_complex(x)): + if isinstance(x, str): + # GH#44008, GH#36703 avoid casting e.g. strings to numeric + raise TypeError(f"Could not convert string '{x}' to numeric") + try: + x = float(x) + except (TypeError, ValueError): + # e.g. "1+1j" or "foo" + try: + x = complex(x) + except ValueError as err: + # e.g. "foo" + raise TypeError(f"Could not convert {x} to numeric") from err + return x + + +def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike: + """ + Cumulative function with skipna support. + + Parameters + ---------- + values : np.ndarray or ExtensionArray + accum_func : {np.cumprod, np.maximum.accumulate, np.cumsum, np.minimum.accumulate} + skipna : bool + + Returns + ------- + np.ndarray or ExtensionArray + """ + mask_a, mask_b = { + np.cumprod: (1.0, np.nan), + np.maximum.accumulate: (-np.inf, np.nan), + np.cumsum: (0.0, np.nan), + np.minimum.accumulate: (np.inf, np.nan), + }[accum_func] + + # This should go through ea interface + assert values.dtype.kind not in "mM" + + # We will be applying this function to block values + if skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): + vals = values.copy() + mask = isna(vals) + vals[mask] = mask_a + result = accum_func(vals, axis=0) + result[mask] = mask_b + else: + result = accum_func(values, axis=0) + + return result diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/resample.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/resample.py new file mode 100644 index 0000000000000000000000000000000000000000..0dd808a0ab296ba0979d2f3ec8c8e847cd5999fb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/resample.py @@ -0,0 +1,2920 @@ +from __future__ import annotations + +import copy +from textwrap import dedent +from typing import ( + TYPE_CHECKING, + Callable, + Literal, + cast, + final, + no_type_check, +) +import warnings + +import numpy as np + +from pandas._libs import lib +from pandas._libs.tslibs import ( + BaseOffset, + IncompatibleFrequency, + NaT, + Period, + Timedelta, + Timestamp, + to_offset, +) +from pandas._libs.tslibs.dtypes import freq_to_period_freqstr +from pandas._typing import NDFrameT +from pandas.compat.numpy import function as nv +from pandas.errors import AbstractMethodError +from pandas.util._decorators import ( + Appender, + Substitution, + doc, +) +from pandas.util._exceptions import ( + find_stack_level, + rewrite_warning, +) + +from pandas.core.dtypes.dtypes import ArrowDtype +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, +) + +import pandas.core.algorithms as algos +from pandas.core.apply import ( + ResamplerWindowApply, + warn_alias_replacement, +) +from pandas.core.arrays import ArrowExtensionArray +from pandas.core.base import ( + PandasObject, + SelectionMixin, +) +import pandas.core.common as com +from pandas.core.generic import ( + NDFrame, + _shared_docs, +) +from pandas.core.groupby.generic import SeriesGroupBy +from pandas.core.groupby.groupby import ( + BaseGroupBy, + GroupBy, + _apply_groupings_depr, + _pipe_template, + get_groupby, +) +from pandas.core.groupby.grouper import Grouper +from pandas.core.groupby.ops import BinGrouper +from pandas.core.indexes.api import MultiIndex +from pandas.core.indexes.base import Index +from pandas.core.indexes.datetimes import ( + DatetimeIndex, + date_range, +) +from pandas.core.indexes.period import ( + PeriodIndex, + period_range, +) +from pandas.core.indexes.timedeltas import ( + TimedeltaIndex, + timedelta_range, +) + +from pandas.tseries.frequencies import ( + is_subperiod, + is_superperiod, +) +from pandas.tseries.offsets import ( + Day, + Tick, +) + +if TYPE_CHECKING: + from collections.abc import Hashable + + from pandas._typing import ( + AnyArrayLike, + Axis, + AxisInt, + Frequency, + IndexLabel, + InterpolateOptions, + T, + TimedeltaConvertibleTypes, + TimeGrouperOrigin, + TimestampConvertibleTypes, + npt, + ) + + from pandas import ( + DataFrame, + Series, + ) + +_shared_docs_kwargs: dict[str, str] = {} + + +class Resampler(BaseGroupBy, PandasObject): + """ + Class for resampling datetimelike data, a groupby-like operation. + See aggregate, transform, and apply functions on this object. + + It's easiest to use obj.resample(...) to use Resampler. + + Parameters + ---------- + obj : Series or DataFrame + groupby : TimeGrouper + axis : int, default 0 + kind : str or None + 'period', 'timestamp' to override default index treatment + + Returns + ------- + a Resampler of the appropriate type + + Notes + ----- + After resampling, see aggregate, apply, and transform functions. + """ + + _grouper: BinGrouper + _timegrouper: TimeGrouper + binner: DatetimeIndex | TimedeltaIndex | PeriodIndex # depends on subclass + exclusions: frozenset[Hashable] = frozenset() # for SelectionMixin compat + _internal_names_set = set({"obj", "ax", "_indexer"}) + + # to the groupby descriptor + _attributes = [ + "freq", + "axis", + "closed", + "label", + "convention", + "kind", + "origin", + "offset", + ] + + def __init__( + self, + obj: NDFrame, + timegrouper: TimeGrouper, + axis: Axis = 0, + kind=None, + *, + gpr_index: Index, + group_keys: bool = False, + selection=None, + include_groups: bool = True, + ) -> None: + self._timegrouper = timegrouper + self.keys = None + self.sort = True + self.axis = obj._get_axis_number(axis) + self.kind = kind + self.group_keys = group_keys + self.as_index = True + self.include_groups = include_groups + + self.obj, self.ax, self._indexer = self._timegrouper._set_grouper( + self._convert_obj(obj), sort=True, gpr_index=gpr_index + ) + self.binner, self._grouper = self._get_binner() + self._selection = selection + if self._timegrouper.key is not None: + self.exclusions = frozenset([self._timegrouper.key]) + else: + self.exclusions = frozenset() + + @final + def __str__(self) -> str: + """ + Provide a nice str repr of our rolling object. + """ + attrs = ( + f"{k}={getattr(self._timegrouper, k)}" + for k in self._attributes + if getattr(self._timegrouper, k, None) is not None + ) + return f"{type(self).__name__} [{', '.join(attrs)}]" + + @final + def __getattr__(self, attr: str): + if attr in self._internal_names_set: + return object.__getattribute__(self, attr) + if attr in self._attributes: + return getattr(self._timegrouper, attr) + if attr in self.obj: + return self[attr] + + return object.__getattribute__(self, attr) + + @final + @property + def _from_selection(self) -> bool: + """ + Is the resampling from a DataFrame column or MultiIndex level. + """ + # upsampling and PeriodIndex resampling do not work + # with selection, this state used to catch and raise an error + return self._timegrouper is not None and ( + self._timegrouper.key is not None or self._timegrouper.level is not None + ) + + def _convert_obj(self, obj: NDFrameT) -> NDFrameT: + """ + Provide any conversions for the object in order to correctly handle. + + Parameters + ---------- + obj : Series or DataFrame + + Returns + ------- + Series or DataFrame + """ + return obj._consolidate() + + def _get_binner_for_time(self): + raise AbstractMethodError(self) + + @final + def _get_binner(self): + """ + Create the BinGrouper, assume that self.set_grouper(obj) + has already been called. + """ + binner, bins, binlabels = self._get_binner_for_time() + assert len(bins) == len(binlabels) + bin_grouper = BinGrouper(bins, binlabels, indexer=self._indexer) + return binner, bin_grouper + + @final + @Substitution( + klass="Resampler", + examples=""" + >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, + ... index=pd.date_range('2012-08-02', periods=4)) + >>> df + A + 2012-08-02 1 + 2012-08-03 2 + 2012-08-04 3 + 2012-08-05 4 + + To get the difference between each 2-day period's maximum and minimum + value in one pass, you can do + + >>> df.resample('2D').pipe(lambda x: x.max() - x.min()) + A + 2012-08-02 1 + 2012-08-04 1""", + ) + @Appender(_pipe_template) + def pipe( + self, + func: Callable[..., T] | tuple[Callable[..., T], str], + *args, + **kwargs, + ) -> T: + return super().pipe(func, *args, **kwargs) + + _agg_see_also_doc = dedent( + """ + See Also + -------- + DataFrame.groupby.aggregate : Aggregate using callable, string, dict, + or list of string/callables. + DataFrame.resample.transform : Transforms the Series on each group + based on the given function. + DataFrame.aggregate: Aggregate using one or more + operations over the specified axis. + """ + ) + + _agg_examples_doc = dedent( + """ + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4, 5], + ... index=pd.date_range('20130101', periods=5, freq='s')) + >>> s + 2013-01-01 00:00:00 1 + 2013-01-01 00:00:01 2 + 2013-01-01 00:00:02 3 + 2013-01-01 00:00:03 4 + 2013-01-01 00:00:04 5 + Freq: s, dtype: int64 + + >>> r = s.resample('2s') + + >>> r.agg("sum") + 2013-01-01 00:00:00 3 + 2013-01-01 00:00:02 7 + 2013-01-01 00:00:04 5 + Freq: 2s, dtype: int64 + + >>> r.agg(['sum', 'mean', 'max']) + sum mean max + 2013-01-01 00:00:00 3 1.5 2 + 2013-01-01 00:00:02 7 3.5 4 + 2013-01-01 00:00:04 5 5.0 5 + + >>> r.agg({'result': lambda x: x.mean() / x.std(), + ... 'total': "sum"}) + result total + 2013-01-01 00:00:00 2.121320 3 + 2013-01-01 00:00:02 4.949747 7 + 2013-01-01 00:00:04 NaN 5 + + >>> r.agg(average="mean", total="sum") + average total + 2013-01-01 00:00:00 1.5 3 + 2013-01-01 00:00:02 3.5 7 + 2013-01-01 00:00:04 5.0 5 + """ + ) + + @final + @doc( + _shared_docs["aggregate"], + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + klass="DataFrame", + axis="", + ) + def aggregate(self, func=None, *args, **kwargs): + result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() + if result is None: + how = func + result = self._groupby_and_aggregate(how, *args, **kwargs) + + return result + + agg = aggregate + apply = aggregate + + @final + def transform(self, arg, *args, **kwargs): + """ + Call function producing a like-indexed Series on each group. + + Return a Series with the transformed values. + + Parameters + ---------- + arg : function + To apply to each group. Should return a Series with the same index. + + Returns + ------- + Series + + Examples + -------- + >>> s = pd.Series([1, 2], + ... index=pd.date_range('20180101', + ... periods=2, + ... freq='1h')) + >>> s + 2018-01-01 00:00:00 1 + 2018-01-01 01:00:00 2 + Freq: h, dtype: int64 + + >>> resampled = s.resample('15min') + >>> resampled.transform(lambda x: (x - x.mean()) / x.std()) + 2018-01-01 00:00:00 NaN + 2018-01-01 01:00:00 NaN + Freq: h, dtype: float64 + """ + return self._selected_obj.groupby(self._timegrouper).transform( + arg, *args, **kwargs + ) + + def _downsample(self, f, **kwargs): + raise AbstractMethodError(self) + + def _upsample(self, f, limit: int | None = None, fill_value=None): + raise AbstractMethodError(self) + + def _gotitem(self, key, ndim: int, subset=None): + """ + Sub-classes to define. Return a sliced object. + + Parameters + ---------- + key : string / list of selections + ndim : {1, 2} + requested ndim of result + subset : object, default None + subset to act on + """ + grouper = self._grouper + if subset is None: + subset = self.obj + if key is not None: + subset = subset[key] + else: + # reached via Apply.agg_dict_like with selection=None and ndim=1 + assert subset.ndim == 1 + if ndim == 1: + assert subset.ndim == 1 + + grouped = get_groupby( + subset, by=None, grouper=grouper, axis=self.axis, group_keys=self.group_keys + ) + return grouped + + def _groupby_and_aggregate(self, how, *args, **kwargs): + """ + Re-evaluate the obj with a groupby aggregation. + """ + grouper = self._grouper + + # Excludes `on` column when provided + obj = self._obj_with_exclusions + + grouped = get_groupby( + obj, by=None, grouper=grouper, axis=self.axis, group_keys=self.group_keys + ) + + try: + if callable(how): + # TODO: test_resample_apply_with_additional_args fails if we go + # through the non-lambda path, not clear that it should. + func = lambda x: how(x, *args, **kwargs) + result = grouped.aggregate(func) + else: + result = grouped.aggregate(how, *args, **kwargs) + except (AttributeError, KeyError): + # we have a non-reducing function; try to evaluate + # alternatively we want to evaluate only a column of the input + + # test_apply_to_one_column_of_df the function being applied references + # a DataFrame column, but aggregate_item_by_item operates column-wise + # on Series, raising AttributeError or KeyError + # (depending on whether the column lookup uses getattr/__getitem__) + result = _apply( + grouped, how, *args, include_groups=self.include_groups, **kwargs + ) + + except ValueError as err: + if "Must produce aggregated value" in str(err): + # raised in _aggregate_named + # see test_apply_without_aggregation, test_apply_with_mutated_index + pass + else: + raise + + # we have a non-reducing function + # try to evaluate + result = _apply( + grouped, how, *args, include_groups=self.include_groups, **kwargs + ) + + return self._wrap_result(result) + + @final + def _get_resampler_for_grouping( + self, groupby: GroupBy, key, include_groups: bool = True + ): + """ + Return the correct class for resampling with groupby. + """ + return self._resampler_for_grouping( + groupby=groupby, key=key, parent=self, include_groups=include_groups + ) + + def _wrap_result(self, result): + """ + Potentially wrap any results. + """ + # GH 47705 + obj = self.obj + if ( + isinstance(result, ABCDataFrame) + and len(result) == 0 + and not isinstance(result.index, PeriodIndex) + ): + result = result.set_index( + _asfreq_compat(obj.index[:0], freq=self.freq), append=True + ) + + if isinstance(result, ABCSeries) and self._selection is not None: + result.name = self._selection + + if isinstance(result, ABCSeries) and result.empty: + # When index is all NaT, result is empty but index is not + result.index = _asfreq_compat(obj.index[:0], freq=self.freq) + result.name = getattr(obj, "name", None) + + if self._timegrouper._arrow_dtype is not None: + result.index = result.index.astype(self._timegrouper._arrow_dtype) + + return result + + @final + def ffill(self, limit: int | None = None): + """ + Forward fill the values. + + Parameters + ---------- + limit : int, optional + Limit of how many values to fill. + + Returns + ------- + An upsampled Series. + + See Also + -------- + Series.fillna: Fill NA/NaN values using the specified method. + DataFrame.fillna: Fill NA/NaN values using the specified method. + + Examples + -------- + Here we only create a ``Series``. + + >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( + ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + + Example for ``ffill`` with downsampling (we have fewer dates after resampling): + + >>> ser.resample('MS').ffill() + 2023-01-01 1 + 2023-02-01 3 + Freq: MS, dtype: int64 + + Example for ``ffill`` with upsampling (fill the new dates with + the previous value): + + >>> ser.resample('W').ffill() + 2023-01-01 1 + 2023-01-08 1 + 2023-01-15 2 + 2023-01-22 2 + 2023-01-29 2 + 2023-02-05 3 + 2023-02-12 3 + 2023-02-19 4 + Freq: W-SUN, dtype: int64 + + With upsampling and limiting (only fill the first new date with the + previous value): + + >>> ser.resample('W').ffill(limit=1) + 2023-01-01 1.0 + 2023-01-08 1.0 + 2023-01-15 2.0 + 2023-01-22 2.0 + 2023-01-29 NaN + 2023-02-05 3.0 + 2023-02-12 NaN + 2023-02-19 4.0 + Freq: W-SUN, dtype: float64 + """ + return self._upsample("ffill", limit=limit) + + @final + def nearest(self, limit: int | None = None): + """ + Resample by using the nearest value. + + When resampling data, missing values may appear (e.g., when the + resampling frequency is higher than the original frequency). + The `nearest` method will replace ``NaN`` values that appeared in + the resampled data with the value from the nearest member of the + sequence, based on the index value. + Missing values that existed in the original data will not be modified. + If `limit` is given, fill only this many values in each direction for + each of the original values. + + Parameters + ---------- + limit : int, optional + Limit of how many values to fill. + + Returns + ------- + Series or DataFrame + An upsampled Series or DataFrame with ``NaN`` values filled with + their nearest value. + + See Also + -------- + backfill : Backward fill the new missing values in the resampled data. + pad : Forward fill ``NaN`` values. + + Examples + -------- + >>> s = pd.Series([1, 2], + ... index=pd.date_range('20180101', + ... periods=2, + ... freq='1h')) + >>> s + 2018-01-01 00:00:00 1 + 2018-01-01 01:00:00 2 + Freq: h, dtype: int64 + + >>> s.resample('15min').nearest() + 2018-01-01 00:00:00 1 + 2018-01-01 00:15:00 1 + 2018-01-01 00:30:00 2 + 2018-01-01 00:45:00 2 + 2018-01-01 01:00:00 2 + Freq: 15min, dtype: int64 + + Limit the number of upsampled values imputed by the nearest: + + >>> s.resample('15min').nearest(limit=1) + 2018-01-01 00:00:00 1.0 + 2018-01-01 00:15:00 1.0 + 2018-01-01 00:30:00 NaN + 2018-01-01 00:45:00 2.0 + 2018-01-01 01:00:00 2.0 + Freq: 15min, dtype: float64 + """ + return self._upsample("nearest", limit=limit) + + @final + def bfill(self, limit: int | None = None): + """ + Backward fill the new missing values in the resampled data. + + In statistics, imputation is the process of replacing missing data with + substituted values [1]_. When resampling data, missing values may + appear (e.g., when the resampling frequency is higher than the original + frequency). The backward fill will replace NaN values that appeared in + the resampled data with the next value in the original sequence. + Missing values that existed in the original data will not be modified. + + Parameters + ---------- + limit : int, optional + Limit of how many values to fill. + + Returns + ------- + Series, DataFrame + An upsampled Series or DataFrame with backward filled NaN values. + + See Also + -------- + bfill : Alias of backfill. + fillna : Fill NaN values using the specified method, which can be + 'backfill'. + nearest : Fill NaN values with nearest neighbor starting from center. + ffill : Forward fill NaN values. + Series.fillna : Fill NaN values in the Series using the + specified method, which can be 'backfill'. + DataFrame.fillna : Fill NaN values in the DataFrame using the + specified method, which can be 'backfill'. + + References + ---------- + .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics) + + Examples + -------- + Resampling a Series: + + >>> s = pd.Series([1, 2, 3], + ... index=pd.date_range('20180101', periods=3, freq='h')) + >>> s + 2018-01-01 00:00:00 1 + 2018-01-01 01:00:00 2 + 2018-01-01 02:00:00 3 + Freq: h, dtype: int64 + + >>> s.resample('30min').bfill() + 2018-01-01 00:00:00 1 + 2018-01-01 00:30:00 2 + 2018-01-01 01:00:00 2 + 2018-01-01 01:30:00 3 + 2018-01-01 02:00:00 3 + Freq: 30min, dtype: int64 + + >>> s.resample('15min').bfill(limit=2) + 2018-01-01 00:00:00 1.0 + 2018-01-01 00:15:00 NaN + 2018-01-01 00:30:00 2.0 + 2018-01-01 00:45:00 2.0 + 2018-01-01 01:00:00 2.0 + 2018-01-01 01:15:00 NaN + 2018-01-01 01:30:00 3.0 + 2018-01-01 01:45:00 3.0 + 2018-01-01 02:00:00 3.0 + Freq: 15min, dtype: float64 + + Resampling a DataFrame that has missing values: + + >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]}, + ... index=pd.date_range('20180101', periods=3, + ... freq='h')) + >>> df + a b + 2018-01-01 00:00:00 2.0 1 + 2018-01-01 01:00:00 NaN 3 + 2018-01-01 02:00:00 6.0 5 + + >>> df.resample('30min').bfill() + a b + 2018-01-01 00:00:00 2.0 1 + 2018-01-01 00:30:00 NaN 3 + 2018-01-01 01:00:00 NaN 3 + 2018-01-01 01:30:00 6.0 5 + 2018-01-01 02:00:00 6.0 5 + + >>> df.resample('15min').bfill(limit=2) + a b + 2018-01-01 00:00:00 2.0 1.0 + 2018-01-01 00:15:00 NaN NaN + 2018-01-01 00:30:00 NaN 3.0 + 2018-01-01 00:45:00 NaN 3.0 + 2018-01-01 01:00:00 NaN 3.0 + 2018-01-01 01:15:00 NaN NaN + 2018-01-01 01:30:00 6.0 5.0 + 2018-01-01 01:45:00 6.0 5.0 + 2018-01-01 02:00:00 6.0 5.0 + """ + return self._upsample("bfill", limit=limit) + + @final + def fillna(self, method, limit: int | None = None): + """ + Fill missing values introduced by upsampling. + + In statistics, imputation is the process of replacing missing data with + substituted values [1]_. When resampling data, missing values may + appear (e.g., when the resampling frequency is higher than the original + frequency). + + Missing values that existed in the original data will + not be modified. + + Parameters + ---------- + method : {'pad', 'backfill', 'ffill', 'bfill', 'nearest'} + Method to use for filling holes in resampled data + + * 'pad' or 'ffill': use previous valid observation to fill gap + (forward fill). + * 'backfill' or 'bfill': use next valid observation to fill gap. + * 'nearest': use nearest valid observation to fill gap. + + limit : int, optional + Limit of how many consecutive missing values to fill. + + Returns + ------- + Series or DataFrame + An upsampled Series or DataFrame with missing values filled. + + See Also + -------- + bfill : Backward fill NaN values in the resampled data. + ffill : Forward fill NaN values in the resampled data. + nearest : Fill NaN values in the resampled data + with nearest neighbor starting from center. + interpolate : Fill NaN values using interpolation. + Series.fillna : Fill NaN values in the Series using the + specified method, which can be 'bfill' and 'ffill'. + DataFrame.fillna : Fill NaN values in the DataFrame using the + specified method, which can be 'bfill' and 'ffill'. + + References + ---------- + .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics) + + Examples + -------- + Resampling a Series: + + >>> s = pd.Series([1, 2, 3], + ... index=pd.date_range('20180101', periods=3, freq='h')) + >>> s + 2018-01-01 00:00:00 1 + 2018-01-01 01:00:00 2 + 2018-01-01 02:00:00 3 + Freq: h, dtype: int64 + + Without filling the missing values you get: + + >>> s.resample("30min").asfreq() + 2018-01-01 00:00:00 1.0 + 2018-01-01 00:30:00 NaN + 2018-01-01 01:00:00 2.0 + 2018-01-01 01:30:00 NaN + 2018-01-01 02:00:00 3.0 + Freq: 30min, dtype: float64 + + >>> s.resample('30min').fillna("backfill") + 2018-01-01 00:00:00 1 + 2018-01-01 00:30:00 2 + 2018-01-01 01:00:00 2 + 2018-01-01 01:30:00 3 + 2018-01-01 02:00:00 3 + Freq: 30min, dtype: int64 + + >>> s.resample('15min').fillna("backfill", limit=2) + 2018-01-01 00:00:00 1.0 + 2018-01-01 00:15:00 NaN + 2018-01-01 00:30:00 2.0 + 2018-01-01 00:45:00 2.0 + 2018-01-01 01:00:00 2.0 + 2018-01-01 01:15:00 NaN + 2018-01-01 01:30:00 3.0 + 2018-01-01 01:45:00 3.0 + 2018-01-01 02:00:00 3.0 + Freq: 15min, dtype: float64 + + >>> s.resample('30min').fillna("pad") + 2018-01-01 00:00:00 1 + 2018-01-01 00:30:00 1 + 2018-01-01 01:00:00 2 + 2018-01-01 01:30:00 2 + 2018-01-01 02:00:00 3 + Freq: 30min, dtype: int64 + + >>> s.resample('30min').fillna("nearest") + 2018-01-01 00:00:00 1 + 2018-01-01 00:30:00 2 + 2018-01-01 01:00:00 2 + 2018-01-01 01:30:00 3 + 2018-01-01 02:00:00 3 + Freq: 30min, dtype: int64 + + Missing values present before the upsampling are not affected. + + >>> sm = pd.Series([1, None, 3], + ... index=pd.date_range('20180101', periods=3, freq='h')) + >>> sm + 2018-01-01 00:00:00 1.0 + 2018-01-01 01:00:00 NaN + 2018-01-01 02:00:00 3.0 + Freq: h, dtype: float64 + + >>> sm.resample('30min').fillna('backfill') + 2018-01-01 00:00:00 1.0 + 2018-01-01 00:30:00 NaN + 2018-01-01 01:00:00 NaN + 2018-01-01 01:30:00 3.0 + 2018-01-01 02:00:00 3.0 + Freq: 30min, dtype: float64 + + >>> sm.resample('30min').fillna('pad') + 2018-01-01 00:00:00 1.0 + 2018-01-01 00:30:00 1.0 + 2018-01-01 01:00:00 NaN + 2018-01-01 01:30:00 NaN + 2018-01-01 02:00:00 3.0 + Freq: 30min, dtype: float64 + + >>> sm.resample('30min').fillna('nearest') + 2018-01-01 00:00:00 1.0 + 2018-01-01 00:30:00 NaN + 2018-01-01 01:00:00 NaN + 2018-01-01 01:30:00 3.0 + 2018-01-01 02:00:00 3.0 + Freq: 30min, dtype: float64 + + DataFrame resampling is done column-wise. All the same options are + available. + + >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]}, + ... index=pd.date_range('20180101', periods=3, + ... freq='h')) + >>> df + a b + 2018-01-01 00:00:00 2.0 1 + 2018-01-01 01:00:00 NaN 3 + 2018-01-01 02:00:00 6.0 5 + + >>> df.resample('30min').fillna("bfill") + a b + 2018-01-01 00:00:00 2.0 1 + 2018-01-01 00:30:00 NaN 3 + 2018-01-01 01:00:00 NaN 3 + 2018-01-01 01:30:00 6.0 5 + 2018-01-01 02:00:00 6.0 5 + """ + warnings.warn( + f"{type(self).__name__}.fillna is deprecated and will be removed " + "in a future version. Use obj.ffill(), obj.bfill(), " + "or obj.nearest() instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self._upsample(method, limit=limit) + + @final + def interpolate( + self, + method: InterpolateOptions = "linear", + *, + axis: Axis = 0, + limit: int | None = None, + inplace: bool = False, + limit_direction: Literal["forward", "backward", "both"] = "forward", + limit_area=None, + downcast=lib.no_default, + **kwargs, + ): + """ + Interpolate values between target timestamps according to different methods. + + The original index is first reindexed to target timestamps + (see :meth:`core.resample.Resampler.asfreq`), + then the interpolation of ``NaN`` values via :meth:`DataFrame.interpolate` + happens. + + Parameters + ---------- + method : str, default 'linear' + Interpolation technique to use. One of: + + * 'linear': Ignore the index and treat the values as equally + spaced. This is the only method supported on MultiIndexes. + * 'time': Works on daily and higher resolution data to interpolate + given length of interval. + * 'index', 'values': use the actual numerical values of the index. + * 'pad': Fill in NaNs using existing values. + * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', + 'barycentric', 'polynomial': Passed to + `scipy.interpolate.interp1d`, whereas 'spline' is passed to + `scipy.interpolate.UnivariateSpline`. These methods use the numerical + values of the index. Both 'polynomial' and 'spline' require that + you also specify an `order` (int), e.g. + ``df.interpolate(method='polynomial', order=5)``. Note that, + `slinear` method in Pandas refers to the Scipy first order `spline` + instead of Pandas first order `spline`. + * 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima', + 'cubicspline': Wrappers around the SciPy interpolation methods of + similar names. See `Notes`. + * 'from_derivatives': Refers to + `scipy.interpolate.BPoly.from_derivatives`. + + axis : {{0 or 'index', 1 or 'columns', None}}, default None + Axis to interpolate along. For `Series` this parameter is unused + and defaults to 0. + limit : int, optional + Maximum number of consecutive NaNs to fill. Must be greater than + 0. + inplace : bool, default False + Update the data in place if possible. + limit_direction : {{'forward', 'backward', 'both'}}, Optional + Consecutive NaNs will be filled in this direction. + + If limit is specified: + * If 'method' is 'pad' or 'ffill', 'limit_direction' must be 'forward'. + * If 'method' is 'backfill' or 'bfill', 'limit_direction' must be + 'backwards'. + + If 'limit' is not specified: + * If 'method' is 'backfill' or 'bfill', the default is 'backward' + * else the default is 'forward' + + raises ValueError if `limit_direction` is 'forward' or 'both' and + method is 'backfill' or 'bfill'. + raises ValueError if `limit_direction` is 'backward' or 'both' and + method is 'pad' or 'ffill'. + + limit_area : {{`None`, 'inside', 'outside'}}, default None + If limit is specified, consecutive NaNs will be filled with this + restriction. + + * ``None``: No fill restriction. + * 'inside': Only fill NaNs surrounded by valid values + (interpolate). + * 'outside': Only fill NaNs outside valid values (extrapolate). + + downcast : optional, 'infer' or None, defaults to None + Downcast dtypes if possible. + + .. deprecated:: 2.1.0 + + ``**kwargs`` : optional + Keyword arguments to pass on to the interpolating function. + + Returns + ------- + DataFrame or Series + Interpolated values at the specified freq. + + See Also + -------- + core.resample.Resampler.asfreq: Return the values at the new freq, + essentially a reindex. + DataFrame.interpolate: Fill NaN values using an interpolation method. + + Notes + ----- + For high-frequent or non-equidistant time-series with timestamps + the reindexing followed by interpolation may lead to information loss + as shown in the last example. + + Examples + -------- + + >>> start = "2023-03-01T07:00:00" + >>> timesteps = pd.date_range(start, periods=5, freq="s") + >>> series = pd.Series(data=[1, -1, 2, 1, 3], index=timesteps) + >>> series + 2023-03-01 07:00:00 1 + 2023-03-01 07:00:01 -1 + 2023-03-01 07:00:02 2 + 2023-03-01 07:00:03 1 + 2023-03-01 07:00:04 3 + Freq: s, dtype: int64 + + Upsample the dataframe to 0.5Hz by providing the period time of 2s. + + >>> series.resample("2s").interpolate("linear") + 2023-03-01 07:00:00 1 + 2023-03-01 07:00:02 2 + 2023-03-01 07:00:04 3 + Freq: 2s, dtype: int64 + + Downsample the dataframe to 2Hz by providing the period time of 500ms. + + >>> series.resample("500ms").interpolate("linear") + 2023-03-01 07:00:00.000 1.0 + 2023-03-01 07:00:00.500 0.0 + 2023-03-01 07:00:01.000 -1.0 + 2023-03-01 07:00:01.500 0.5 + 2023-03-01 07:00:02.000 2.0 + 2023-03-01 07:00:02.500 1.5 + 2023-03-01 07:00:03.000 1.0 + 2023-03-01 07:00:03.500 2.0 + 2023-03-01 07:00:04.000 3.0 + Freq: 500ms, dtype: float64 + + Internal reindexing with ``asfreq()`` prior to interpolation leads to + an interpolated timeseries on the basis the reindexed timestamps (anchors). + Since not all datapoints from original series become anchors, + it can lead to misleading interpolation results as in the following example: + + >>> series.resample("400ms").interpolate("linear") + 2023-03-01 07:00:00.000 1.0 + 2023-03-01 07:00:00.400 1.2 + 2023-03-01 07:00:00.800 1.4 + 2023-03-01 07:00:01.200 1.6 + 2023-03-01 07:00:01.600 1.8 + 2023-03-01 07:00:02.000 2.0 + 2023-03-01 07:00:02.400 2.2 + 2023-03-01 07:00:02.800 2.4 + 2023-03-01 07:00:03.200 2.6 + 2023-03-01 07:00:03.600 2.8 + 2023-03-01 07:00:04.000 3.0 + Freq: 400ms, dtype: float64 + + Note that the series erroneously increases between two anchors + ``07:00:00`` and ``07:00:02``. + """ + assert downcast is lib.no_default # just checking coverage + result = self._upsample("asfreq") + return result.interpolate( + method=method, + axis=axis, + limit=limit, + inplace=inplace, + limit_direction=limit_direction, + limit_area=limit_area, + downcast=downcast, + **kwargs, + ) + + @final + def asfreq(self, fill_value=None): + """ + Return the values at the new freq, essentially a reindex. + + Parameters + ---------- + fill_value : scalar, optional + Value to use for missing values, applied during upsampling (note + this does not fill NaNs that already were present). + + Returns + ------- + DataFrame or Series + Values at the specified freq. + + See Also + -------- + Series.asfreq: Convert TimeSeries to specified frequency. + DataFrame.asfreq: Convert TimeSeries to specified frequency. + + Examples + -------- + + >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( + ... ['2023-01-01', '2023-01-31', '2023-02-01', '2023-02-28'])) + >>> ser + 2023-01-01 1 + 2023-01-31 2 + 2023-02-01 3 + 2023-02-28 4 + dtype: int64 + >>> ser.resample('MS').asfreq() + 2023-01-01 1 + 2023-02-01 3 + Freq: MS, dtype: int64 + """ + return self._upsample("asfreq", fill_value=fill_value) + + @final + def sum( + self, + numeric_only: bool = False, + min_count: int = 0, + *args, + **kwargs, + ): + """ + Compute sum of group values. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None``. + + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + + Returns + ------- + Series or DataFrame + Computed sum of values within each group. + + Examples + -------- + >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( + ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + >>> ser.resample('MS').sum() + 2023-01-01 3 + 2023-02-01 7 + Freq: MS, dtype: int64 + """ + maybe_warn_args_and_kwargs(type(self), "sum", args, kwargs) + nv.validate_resampler_func("sum", args, kwargs) + return self._downsample("sum", numeric_only=numeric_only, min_count=min_count) + + @final + def prod( + self, + numeric_only: bool = False, + min_count: int = 0, + *args, + **kwargs, + ): + """ + Compute prod of group values. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None``. + + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + + Returns + ------- + Series or DataFrame + Computed prod of values within each group. + + Examples + -------- + >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( + ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + >>> ser.resample('MS').prod() + 2023-01-01 2 + 2023-02-01 12 + Freq: MS, dtype: int64 + """ + maybe_warn_args_and_kwargs(type(self), "prod", args, kwargs) + nv.validate_resampler_func("prod", args, kwargs) + return self._downsample("prod", numeric_only=numeric_only, min_count=min_count) + + @final + def min( + self, + numeric_only: bool = False, + min_count: int = 0, + *args, + **kwargs, + ): + """ + Compute min value of group. + + Returns + ------- + Series or DataFrame + + Examples + -------- + >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( + ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + >>> ser.resample('MS').min() + 2023-01-01 1 + 2023-02-01 3 + Freq: MS, dtype: int64 + """ + + maybe_warn_args_and_kwargs(type(self), "min", args, kwargs) + nv.validate_resampler_func("min", args, kwargs) + return self._downsample("min", numeric_only=numeric_only, min_count=min_count) + + @final + def max( + self, + numeric_only: bool = False, + min_count: int = 0, + *args, + **kwargs, + ): + """ + Compute max value of group. + + Returns + ------- + Series or DataFrame + + Examples + -------- + >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( + ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + >>> ser.resample('MS').max() + 2023-01-01 2 + 2023-02-01 4 + Freq: MS, dtype: int64 + """ + maybe_warn_args_and_kwargs(type(self), "max", args, kwargs) + nv.validate_resampler_func("max", args, kwargs) + return self._downsample("max", numeric_only=numeric_only, min_count=min_count) + + @final + @doc(GroupBy.first) + def first( + self, + numeric_only: bool = False, + min_count: int = 0, + skipna: bool = True, + *args, + **kwargs, + ): + maybe_warn_args_and_kwargs(type(self), "first", args, kwargs) + nv.validate_resampler_func("first", args, kwargs) + return self._downsample( + "first", numeric_only=numeric_only, min_count=min_count, skipna=skipna + ) + + @final + @doc(GroupBy.last) + def last( + self, + numeric_only: bool = False, + min_count: int = 0, + skipna: bool = True, + *args, + **kwargs, + ): + maybe_warn_args_and_kwargs(type(self), "last", args, kwargs) + nv.validate_resampler_func("last", args, kwargs) + return self._downsample( + "last", numeric_only=numeric_only, min_count=min_count, skipna=skipna + ) + + @final + @doc(GroupBy.median) + def median(self, numeric_only: bool = False, *args, **kwargs): + maybe_warn_args_and_kwargs(type(self), "median", args, kwargs) + nv.validate_resampler_func("median", args, kwargs) + return self._downsample("median", numeric_only=numeric_only) + + @final + def mean( + self, + numeric_only: bool = False, + *args, + **kwargs, + ): + """ + Compute mean of groups, excluding missing values. + + Parameters + ---------- + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. + + Returns + ------- + DataFrame or Series + Mean of values within each group. + + Examples + -------- + + >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( + ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + >>> ser.resample('MS').mean() + 2023-01-01 1.5 + 2023-02-01 3.5 + Freq: MS, dtype: float64 + """ + maybe_warn_args_and_kwargs(type(self), "mean", args, kwargs) + nv.validate_resampler_func("mean", args, kwargs) + return self._downsample("mean", numeric_only=numeric_only) + + @final + def std( + self, + ddof: int = 1, + numeric_only: bool = False, + *args, + **kwargs, + ): + """ + Compute standard deviation of groups, excluding missing values. + + Parameters + ---------- + ddof : int, default 1 + Degrees of freedom. + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. + + Returns + ------- + DataFrame or Series + Standard deviation of values within each group. + + Examples + -------- + + >>> ser = pd.Series([1, 3, 2, 4, 3, 8], + ... index=pd.DatetimeIndex(['2023-01-01', + ... '2023-01-10', + ... '2023-01-15', + ... '2023-02-01', + ... '2023-02-10', + ... '2023-02-15'])) + >>> ser.resample('MS').std() + 2023-01-01 1.000000 + 2023-02-01 2.645751 + Freq: MS, dtype: float64 + """ + maybe_warn_args_and_kwargs(type(self), "std", args, kwargs) + nv.validate_resampler_func("std", args, kwargs) + return self._downsample("std", ddof=ddof, numeric_only=numeric_only) + + @final + def var( + self, + ddof: int = 1, + numeric_only: bool = False, + *args, + **kwargs, + ): + """ + Compute variance of groups, excluding missing values. + + Parameters + ---------- + ddof : int, default 1 + Degrees of freedom. + + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. + + Returns + ------- + DataFrame or Series + Variance of values within each group. + + Examples + -------- + + >>> ser = pd.Series([1, 3, 2, 4, 3, 8], + ... index=pd.DatetimeIndex(['2023-01-01', + ... '2023-01-10', + ... '2023-01-15', + ... '2023-02-01', + ... '2023-02-10', + ... '2023-02-15'])) + >>> ser.resample('MS').var() + 2023-01-01 1.0 + 2023-02-01 7.0 + Freq: MS, dtype: float64 + + >>> ser.resample('MS').var(ddof=0) + 2023-01-01 0.666667 + 2023-02-01 4.666667 + Freq: MS, dtype: float64 + """ + maybe_warn_args_and_kwargs(type(self), "var", args, kwargs) + nv.validate_resampler_func("var", args, kwargs) + return self._downsample("var", ddof=ddof, numeric_only=numeric_only) + + @final + @doc(GroupBy.sem) + def sem( + self, + ddof: int = 1, + numeric_only: bool = False, + *args, + **kwargs, + ): + maybe_warn_args_and_kwargs(type(self), "sem", args, kwargs) + nv.validate_resampler_func("sem", args, kwargs) + return self._downsample("sem", ddof=ddof, numeric_only=numeric_only) + + @final + @doc(GroupBy.ohlc) + def ohlc( + self, + *args, + **kwargs, + ): + maybe_warn_args_and_kwargs(type(self), "ohlc", args, kwargs) + nv.validate_resampler_func("ohlc", args, kwargs) + + ax = self.ax + obj = self._obj_with_exclusions + if len(ax) == 0: + # GH#42902 + obj = obj.copy() + obj.index = _asfreq_compat(obj.index, self.freq) + if obj.ndim == 1: + obj = obj.to_frame() + obj = obj.reindex(["open", "high", "low", "close"], axis=1) + else: + mi = MultiIndex.from_product( + [obj.columns, ["open", "high", "low", "close"]] + ) + obj = obj.reindex(mi, axis=1) + return obj + + return self._downsample("ohlc") + + @final + @doc(SeriesGroupBy.nunique) + def nunique( + self, + *args, + **kwargs, + ): + maybe_warn_args_and_kwargs(type(self), "nunique", args, kwargs) + nv.validate_resampler_func("nunique", args, kwargs) + return self._downsample("nunique") + + @final + @doc(GroupBy.size) + def size(self): + result = self._downsample("size") + + # If the result is a non-empty DataFrame we stack to get a Series + # GH 46826 + if isinstance(result, ABCDataFrame) and not result.empty: + result = result.stack(future_stack=True) + + if not len(self.ax): + from pandas import Series + + if self._selected_obj.ndim == 1: + name = self._selected_obj.name + else: + name = None + result = Series([], index=result.index, dtype="int64", name=name) + return result + + @final + @doc(GroupBy.count) + def count(self): + result = self._downsample("count") + if not len(self.ax): + if self._selected_obj.ndim == 1: + result = type(self._selected_obj)( + [], index=result.index, dtype="int64", name=self._selected_obj.name + ) + else: + from pandas import DataFrame + + result = DataFrame( + [], index=result.index, columns=result.columns, dtype="int64" + ) + + return result + + @final + def quantile(self, q: float | list[float] | AnyArrayLike = 0.5, **kwargs): + """ + Return value at the given quantile. + + Parameters + ---------- + q : float or array-like, default 0.5 (50% quantile) + + Returns + ------- + DataFrame or Series + Quantile of values within each group. + + See Also + -------- + Series.quantile + Return a series, where the index is q and the values are the quantiles. + DataFrame.quantile + Return a DataFrame, where the columns are the columns of self, + and the values are the quantiles. + DataFrameGroupBy.quantile + Return a DataFrame, where the columns are groupby columns, + and the values are its quantiles. + + Examples + -------- + + >>> ser = pd.Series([1, 3, 2, 4, 3, 8], + ... index=pd.DatetimeIndex(['2023-01-01', + ... '2023-01-10', + ... '2023-01-15', + ... '2023-02-01', + ... '2023-02-10', + ... '2023-02-15'])) + >>> ser.resample('MS').quantile() + 2023-01-01 2.0 + 2023-02-01 4.0 + Freq: MS, dtype: float64 + + >>> ser.resample('MS').quantile(.25) + 2023-01-01 1.5 + 2023-02-01 3.5 + Freq: MS, dtype: float64 + """ + return self._downsample("quantile", q=q, **kwargs) + + +class _GroupByMixin(PandasObject, SelectionMixin): + """ + Provide the groupby facilities. + """ + + _attributes: list[str] # in practice the same as Resampler._attributes + _selection: IndexLabel | None = None + _groupby: GroupBy + _timegrouper: TimeGrouper + + def __init__( + self, + *, + parent: Resampler, + groupby: GroupBy, + key=None, + selection: IndexLabel | None = None, + include_groups: bool = False, + ) -> None: + # reached via ._gotitem and _get_resampler_for_grouping + + assert isinstance(groupby, GroupBy), type(groupby) + + # parent is always a Resampler, sometimes a _GroupByMixin + assert isinstance(parent, Resampler), type(parent) + + # initialize our GroupByMixin object with + # the resampler attributes + for attr in self._attributes: + setattr(self, attr, getattr(parent, attr)) + self._selection = selection + + self.binner = parent.binner + self.key = key + + self._groupby = groupby + self._timegrouper = copy.copy(parent._timegrouper) + + self.ax = parent.ax + self.obj = parent.obj + self.include_groups = include_groups + + @no_type_check + def _apply(self, f, *args, **kwargs): + """ + Dispatch to _upsample; we are stripping all of the _upsample kwargs and + performing the original function call on the grouped object. + """ + + def func(x): + x = self._resampler_cls(x, timegrouper=self._timegrouper, gpr_index=self.ax) + + if isinstance(f, str): + return getattr(x, f)(**kwargs) + + return x.apply(f, *args, **kwargs) + + result = _apply(self._groupby, func, include_groups=self.include_groups) + return self._wrap_result(result) + + _upsample = _apply + _downsample = _apply + _groupby_and_aggregate = _apply + + @final + def _gotitem(self, key, ndim, subset=None): + """ + Sub-classes to define. Return a sliced object. + + Parameters + ---------- + key : string / list of selections + ndim : {1, 2} + requested ndim of result + subset : object, default None + subset to act on + """ + # create a new object to prevent aliasing + if subset is None: + subset = self.obj + if key is not None: + subset = subset[key] + else: + # reached via Apply.agg_dict_like with selection=None, ndim=1 + assert subset.ndim == 1 + + # Try to select from a DataFrame, falling back to a Series + try: + if isinstance(key, list) and self.key not in key and self.key is not None: + key.append(self.key) + groupby = self._groupby[key] + except IndexError: + groupby = self._groupby + + selection = self._infer_selection(key, subset) + + new_rs = type(self)( + groupby=groupby, + parent=cast(Resampler, self), + selection=selection, + ) + return new_rs + + +class DatetimeIndexResampler(Resampler): + ax: DatetimeIndex + + @property + def _resampler_for_grouping(self): + return DatetimeIndexResamplerGroupby + + def _get_binner_for_time(self): + # this is how we are actually creating the bins + if self.kind == "period": + return self._timegrouper._get_time_period_bins(self.ax) + return self._timegrouper._get_time_bins(self.ax) + + def _downsample(self, how, **kwargs): + """ + Downsample the cython defined function. + + Parameters + ---------- + how : string / cython mapped function + **kwargs : kw args passed to how function + """ + orig_how = how + how = com.get_cython_func(how) or how + if orig_how != how: + warn_alias_replacement(self, orig_how, how) + ax = self.ax + + # Excludes `on` column when provided + obj = self._obj_with_exclusions + + if not len(ax): + # reset to the new freq + obj = obj.copy() + obj.index = obj.index._with_freq(self.freq) + assert obj.index.freq == self.freq, (obj.index.freq, self.freq) + return obj + + # do we have a regular frequency + + # error: Item "None" of "Optional[Any]" has no attribute "binlabels" + if ( + (ax.freq is not None or ax.inferred_freq is not None) + and len(self._grouper.binlabels) > len(ax) + and how is None + ): + # let's do an asfreq + return self.asfreq() + + # we are downsampling + # we want to call the actual grouper method here + if self.axis == 0: + result = obj.groupby(self._grouper).aggregate(how, **kwargs) + else: + # test_resample_axis1 + result = obj.T.groupby(self._grouper).aggregate(how, **kwargs).T + + return self._wrap_result(result) + + def _adjust_binner_for_upsample(self, binner): + """ + Adjust our binner when upsampling. + + The range of a new index should not be outside specified range + """ + if self.closed == "right": + binner = binner[1:] + else: + binner = binner[:-1] + return binner + + def _upsample(self, method, limit: int | None = None, fill_value=None): + """ + Parameters + ---------- + method : string {'backfill', 'bfill', 'pad', + 'ffill', 'asfreq'} method for upsampling + limit : int, default None + Maximum size gap to fill when reindexing + fill_value : scalar, default None + Value to use for missing values + + See Also + -------- + .fillna: Fill NA/NaN values using the specified method. + + """ + if self.axis: + raise AssertionError("axis must be 0") + if self._from_selection: + raise ValueError( + "Upsampling from level= or on= selection " + "is not supported, use .set_index(...) " + "to explicitly set index to datetime-like" + ) + + ax = self.ax + obj = self._selected_obj + binner = self.binner + res_index = self._adjust_binner_for_upsample(binner) + + # if we have the same frequency as our axis, then we are equal sampling + if ( + limit is None + and to_offset(ax.inferred_freq) == self.freq + and len(obj) == len(res_index) + ): + result = obj.copy() + result.index = res_index + else: + if method == "asfreq": + method = None + result = obj.reindex( + res_index, method=method, limit=limit, fill_value=fill_value + ) + + return self._wrap_result(result) + + def _wrap_result(self, result): + result = super()._wrap_result(result) + + # we may have a different kind that we were asked originally + # convert if needed + if self.kind == "period" and not isinstance(result.index, PeriodIndex): + if isinstance(result.index, MultiIndex): + # GH 24103 - e.g. groupby resample + if not isinstance(result.index.levels[-1], PeriodIndex): + new_level = result.index.levels[-1].to_period(self.freq) + result.index = result.index.set_levels(new_level, level=-1) + else: + result.index = result.index.to_period(self.freq) + return result + + +# error: Definition of "ax" in base class "_GroupByMixin" is incompatible +# with definition in base class "DatetimeIndexResampler" +class DatetimeIndexResamplerGroupby( # type: ignore[misc] + _GroupByMixin, DatetimeIndexResampler +): + """ + Provides a resample of a groupby implementation + """ + + @property + def _resampler_cls(self): + return DatetimeIndexResampler + + +class PeriodIndexResampler(DatetimeIndexResampler): + # error: Incompatible types in assignment (expression has type "PeriodIndex", base + # class "DatetimeIndexResampler" defined the type as "DatetimeIndex") + ax: PeriodIndex # type: ignore[assignment] + + @property + def _resampler_for_grouping(self): + warnings.warn( + "Resampling a groupby with a PeriodIndex is deprecated. " + "Cast to DatetimeIndex before resampling instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return PeriodIndexResamplerGroupby + + def _get_binner_for_time(self): + if self.kind == "timestamp": + return super()._get_binner_for_time() + return self._timegrouper._get_period_bins(self.ax) + + def _convert_obj(self, obj: NDFrameT) -> NDFrameT: + obj = super()._convert_obj(obj) + + if self._from_selection: + # see GH 14008, GH 12871 + msg = ( + "Resampling from level= or on= selection " + "with a PeriodIndex is not currently supported, " + "use .set_index(...) to explicitly set index" + ) + raise NotImplementedError(msg) + + # convert to timestamp + if self.kind == "timestamp": + obj = obj.to_timestamp(how=self.convention) + + return obj + + def _downsample(self, how, **kwargs): + """ + Downsample the cython defined function. + + Parameters + ---------- + how : string / cython mapped function + **kwargs : kw args passed to how function + """ + # we may need to actually resample as if we are timestamps + if self.kind == "timestamp": + return super()._downsample(how, **kwargs) + + orig_how = how + how = com.get_cython_func(how) or how + if orig_how != how: + warn_alias_replacement(self, orig_how, how) + ax = self.ax + + if is_subperiod(ax.freq, self.freq): + # Downsampling + return self._groupby_and_aggregate(how, **kwargs) + elif is_superperiod(ax.freq, self.freq): + if how == "ohlc": + # GH #13083 + # upsampling to subperiods is handled as an asfreq, which works + # for pure aggregating/reducing methods + # OHLC reduces along the time dimension, but creates multiple + # values for each period -> handle by _groupby_and_aggregate() + return self._groupby_and_aggregate(how) + return self.asfreq() + elif ax.freq == self.freq: + return self.asfreq() + + raise IncompatibleFrequency( + f"Frequency {ax.freq} cannot be resampled to {self.freq}, " + "as they are not sub or super periods" + ) + + def _upsample(self, method, limit: int | None = None, fill_value=None): + """ + Parameters + ---------- + method : {'backfill', 'bfill', 'pad', 'ffill'} + Method for upsampling. + limit : int, default None + Maximum size gap to fill when reindexing. + fill_value : scalar, default None + Value to use for missing values. + + See Also + -------- + .fillna: Fill NA/NaN values using the specified method. + + """ + # we may need to actually resample as if we are timestamps + if self.kind == "timestamp": + return super()._upsample(method, limit=limit, fill_value=fill_value) + + ax = self.ax + obj = self.obj + new_index = self.binner + + # Start vs. end of period + memb = ax.asfreq(self.freq, how=self.convention) + + # Get the fill indexer + if method == "asfreq": + method = None + indexer = memb.get_indexer(new_index, method=method, limit=limit) + new_obj = _take_new_index( + obj, + indexer, + new_index, + axis=self.axis, + ) + return self._wrap_result(new_obj) + + +# error: Definition of "ax" in base class "_GroupByMixin" is incompatible with +# definition in base class "PeriodIndexResampler" +class PeriodIndexResamplerGroupby( # type: ignore[misc] + _GroupByMixin, PeriodIndexResampler +): + """ + Provides a resample of a groupby implementation. + """ + + @property + def _resampler_cls(self): + return PeriodIndexResampler + + +class TimedeltaIndexResampler(DatetimeIndexResampler): + # error: Incompatible types in assignment (expression has type "TimedeltaIndex", + # base class "DatetimeIndexResampler" defined the type as "DatetimeIndex") + ax: TimedeltaIndex # type: ignore[assignment] + + @property + def _resampler_for_grouping(self): + return TimedeltaIndexResamplerGroupby + + def _get_binner_for_time(self): + return self._timegrouper._get_time_delta_bins(self.ax) + + def _adjust_binner_for_upsample(self, binner): + """ + Adjust our binner when upsampling. + + The range of a new index is allowed to be greater than original range + so we don't need to change the length of a binner, GH 13022 + """ + return binner + + +# error: Definition of "ax" in base class "_GroupByMixin" is incompatible with +# definition in base class "DatetimeIndexResampler" +class TimedeltaIndexResamplerGroupby( # type: ignore[misc] + _GroupByMixin, TimedeltaIndexResampler +): + """ + Provides a resample of a groupby implementation. + """ + + @property + def _resampler_cls(self): + return TimedeltaIndexResampler + + +def get_resampler(obj: Series | DataFrame, kind=None, **kwds) -> Resampler: + """ + Create a TimeGrouper and return our resampler. + """ + tg = TimeGrouper(obj, **kwds) # type: ignore[arg-type] + return tg._get_resampler(obj, kind=kind) + + +get_resampler.__doc__ = Resampler.__doc__ + + +def get_resampler_for_grouping( + groupby: GroupBy, + rule, + how=None, + fill_method=None, + limit: int | None = None, + kind=None, + on=None, + include_groups: bool = True, + **kwargs, +) -> Resampler: + """ + Return our appropriate resampler when grouping as well. + """ + # .resample uses 'on' similar to how .groupby uses 'key' + tg = TimeGrouper(freq=rule, key=on, **kwargs) + resampler = tg._get_resampler(groupby.obj, kind=kind) + return resampler._get_resampler_for_grouping( + groupby=groupby, include_groups=include_groups, key=tg.key + ) + + +class TimeGrouper(Grouper): + """ + Custom groupby class for time-interval grouping. + + Parameters + ---------- + freq : pandas date offset or offset alias for identifying bin edges + closed : closed end of interval; 'left' or 'right' + label : interval boundary to use for labeling; 'left' or 'right' + convention : {'start', 'end', 'e', 's'} + If axis is PeriodIndex + """ + + _attributes = Grouper._attributes + ( + "closed", + "label", + "how", + "kind", + "convention", + "origin", + "offset", + ) + + origin: TimeGrouperOrigin + + def __init__( + self, + obj: Grouper | None = None, + freq: Frequency = "Min", + key: str | None = None, + closed: Literal["left", "right"] | None = None, + label: Literal["left", "right"] | None = None, + how: str = "mean", + axis: Axis = 0, + fill_method=None, + limit: int | None = None, + kind: str | None = None, + convention: Literal["start", "end", "e", "s"] | None = None, + origin: Literal["epoch", "start", "start_day", "end", "end_day"] + | TimestampConvertibleTypes = "start_day", + offset: TimedeltaConvertibleTypes | None = None, + group_keys: bool = False, + **kwargs, + ) -> None: + # Check for correctness of the keyword arguments which would + # otherwise silently use the default if misspelled + if label not in {None, "left", "right"}: + raise ValueError(f"Unsupported value {label} for `label`") + if closed not in {None, "left", "right"}: + raise ValueError(f"Unsupported value {closed} for `closed`") + if convention not in {None, "start", "end", "e", "s"}: + raise ValueError(f"Unsupported value {convention} for `convention`") + + if ( + key is None + and obj is not None + and isinstance(obj.index, PeriodIndex) # type: ignore[attr-defined] + or ( + key is not None + and obj is not None + and getattr(obj[key], "dtype", None) == "period" # type: ignore[index] + ) + ): + freq = to_offset(freq, is_period=True) + else: + freq = to_offset(freq) + + end_types = {"ME", "YE", "QE", "BME", "BYE", "BQE", "W"} + rule = freq.rule_code + if rule in end_types or ("-" in rule and rule[: rule.find("-")] in end_types): + if closed is None: + closed = "right" + if label is None: + label = "right" + else: + # The backward resample sets ``closed`` to ``'right'`` by default + # since the last value should be considered as the edge point for + # the last bin. When origin in "end" or "end_day", the value for a + # specific ``Timestamp`` index stands for the resample result from + # the current ``Timestamp`` minus ``freq`` to the current + # ``Timestamp`` with a right close. + if origin in ["end", "end_day"]: + if closed is None: + closed = "right" + if label is None: + label = "right" + else: + if closed is None: + closed = "left" + if label is None: + label = "left" + + self.closed = closed + self.label = label + self.kind = kind + self.convention = convention if convention is not None else "e" + self.how = how + self.fill_method = fill_method + self.limit = limit + self.group_keys = group_keys + self._arrow_dtype: ArrowDtype | None = None + + if origin in ("epoch", "start", "start_day", "end", "end_day"): + # error: Incompatible types in assignment (expression has type "Union[Union[ + # Timestamp, datetime, datetime64, signedinteger[_64Bit], float, str], + # Literal['epoch', 'start', 'start_day', 'end', 'end_day']]", variable has + # type "Union[Timestamp, Literal['epoch', 'start', 'start_day', 'end', + # 'end_day']]") + self.origin = origin # type: ignore[assignment] + else: + try: + self.origin = Timestamp(origin) + except (ValueError, TypeError) as err: + raise ValueError( + "'origin' should be equal to 'epoch', 'start', 'start_day', " + "'end', 'end_day' or " + f"should be a Timestamp convertible type. Got '{origin}' instead." + ) from err + + try: + self.offset = Timedelta(offset) if offset is not None else None + except (ValueError, TypeError) as err: + raise ValueError( + "'offset' should be a Timedelta convertible type. " + f"Got '{offset}' instead." + ) from err + + # always sort time groupers + kwargs["sort"] = True + + super().__init__(freq=freq, key=key, axis=axis, **kwargs) + + def _get_resampler(self, obj: NDFrame, kind=None) -> Resampler: + """ + Return my resampler or raise if we have an invalid axis. + + Parameters + ---------- + obj : Series or DataFrame + kind : string, optional + 'period','timestamp','timedelta' are valid + + Returns + ------- + Resampler + + Raises + ------ + TypeError if incompatible axis + + """ + _, ax, _ = self._set_grouper(obj, gpr_index=None) + if isinstance(ax, DatetimeIndex): + return DatetimeIndexResampler( + obj, + timegrouper=self, + kind=kind, + axis=self.axis, + group_keys=self.group_keys, + gpr_index=ax, + ) + elif isinstance(ax, PeriodIndex) or kind == "period": + if isinstance(ax, PeriodIndex): + # GH#53481 + warnings.warn( + "Resampling with a PeriodIndex is deprecated. " + "Cast index to DatetimeIndex before resampling instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + warnings.warn( + "Resampling with kind='period' is deprecated. " + "Use datetime paths instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return PeriodIndexResampler( + obj, + timegrouper=self, + kind=kind, + axis=self.axis, + group_keys=self.group_keys, + gpr_index=ax, + ) + elif isinstance(ax, TimedeltaIndex): + return TimedeltaIndexResampler( + obj, + timegrouper=self, + axis=self.axis, + group_keys=self.group_keys, + gpr_index=ax, + ) + + raise TypeError( + "Only valid with DatetimeIndex, " + "TimedeltaIndex or PeriodIndex, " + f"but got an instance of '{type(ax).__name__}'" + ) + + def _get_grouper( + self, obj: NDFrameT, validate: bool = True + ) -> tuple[BinGrouper, NDFrameT]: + # create the resampler and return our binner + r = self._get_resampler(obj) + return r._grouper, cast(NDFrameT, r.obj) + + def _get_time_bins(self, ax: DatetimeIndex): + if not isinstance(ax, DatetimeIndex): + raise TypeError( + "axis must be a DatetimeIndex, but got " + f"an instance of {type(ax).__name__}" + ) + + if len(ax) == 0: + binner = labels = DatetimeIndex( + data=[], freq=self.freq, name=ax.name, dtype=ax.dtype + ) + return binner, [], labels + + first, last = _get_timestamp_range_edges( + ax.min(), + ax.max(), + self.freq, + unit=ax.unit, + closed=self.closed, + origin=self.origin, + offset=self.offset, + ) + # GH #12037 + # use first/last directly instead of call replace() on them + # because replace() will swallow the nanosecond part + # thus last bin maybe slightly before the end if the end contains + # nanosecond part and lead to `Values falls after last bin` error + # GH 25758: If DST lands at midnight (e.g. 'America/Havana'), user feedback + # has noted that ambiguous=True provides the most sensible result + binner = labels = date_range( + freq=self.freq, + start=first, + end=last, + tz=ax.tz, + name=ax.name, + ambiguous=True, + nonexistent="shift_forward", + unit=ax.unit, + ) + + ax_values = ax.asi8 + binner, bin_edges = self._adjust_bin_edges(binner, ax_values) + + # general version, knowing nothing about relative frequencies + bins = lib.generate_bins_dt64( + ax_values, bin_edges, self.closed, hasnans=ax.hasnans + ) + + if self.closed == "right": + labels = binner + if self.label == "right": + labels = labels[1:] + elif self.label == "right": + labels = labels[1:] + + if ax.hasnans: + binner = binner.insert(0, NaT) + labels = labels.insert(0, NaT) + + # if we end up with more labels than bins + # adjust the labels + # GH4076 + if len(bins) < len(labels): + labels = labels[: len(bins)] + + return binner, bins, labels + + def _adjust_bin_edges( + self, binner: DatetimeIndex, ax_values: npt.NDArray[np.int64] + ) -> tuple[DatetimeIndex, npt.NDArray[np.int64]]: + # Some hacks for > daily data, see #1471, #1458, #1483 + + if self.freq.name in ("BME", "ME", "W") or self.freq.name.split("-")[0] in ( + "BQE", + "BYE", + "QE", + "YE", + "W", + ): + # If the right end-point is on the last day of the month, roll forwards + # until the last moment of that day. Note that we only do this for offsets + # which correspond to the end of a super-daily period - "month start", for + # example, is excluded. + if self.closed == "right": + # GH 21459, GH 9119: Adjust the bins relative to the wall time + edges_dti = binner.tz_localize(None) + edges_dti = ( + edges_dti + + Timedelta(days=1, unit=edges_dti.unit).as_unit(edges_dti.unit) + - Timedelta(1, unit=edges_dti.unit).as_unit(edges_dti.unit) + ) + bin_edges = edges_dti.tz_localize(binner.tz).asi8 + else: + bin_edges = binner.asi8 + + # intraday values on last day + if bin_edges[-2] > ax_values.max(): + bin_edges = bin_edges[:-1] + binner = binner[:-1] + else: + bin_edges = binner.asi8 + return binner, bin_edges + + def _get_time_delta_bins(self, ax: TimedeltaIndex): + if not isinstance(ax, TimedeltaIndex): + raise TypeError( + "axis must be a TimedeltaIndex, but got " + f"an instance of {type(ax).__name__}" + ) + + if not isinstance(self.freq, Tick): + # GH#51896 + raise ValueError( + "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " + f"e.g. '24h' or '3D', not {self.freq}" + ) + + if not len(ax): + binner = labels = TimedeltaIndex(data=[], freq=self.freq, name=ax.name) + return binner, [], labels + + start, end = ax.min(), ax.max() + + if self.closed == "right": + end += self.freq + + labels = binner = timedelta_range( + start=start, end=end, freq=self.freq, name=ax.name + ) + + end_stamps = labels + if self.closed == "left": + end_stamps += self.freq + + bins = ax.searchsorted(end_stamps, side=self.closed) + + if self.offset: + # GH 10530 & 31809 + labels += self.offset + + return binner, bins, labels + + def _get_time_period_bins(self, ax: DatetimeIndex): + if not isinstance(ax, DatetimeIndex): + raise TypeError( + "axis must be a DatetimeIndex, but got " + f"an instance of {type(ax).__name__}" + ) + + freq = self.freq + + if len(ax) == 0: + binner = labels = PeriodIndex( + data=[], freq=freq, name=ax.name, dtype=ax.dtype + ) + return binner, [], labels + + labels = binner = period_range(start=ax[0], end=ax[-1], freq=freq, name=ax.name) + + end_stamps = (labels + freq).asfreq(freq, "s").to_timestamp() + if ax.tz: + end_stamps = end_stamps.tz_localize(ax.tz) + bins = ax.searchsorted(end_stamps, side="left") + + return binner, bins, labels + + def _get_period_bins(self, ax: PeriodIndex): + if not isinstance(ax, PeriodIndex): + raise TypeError( + "axis must be a PeriodIndex, but got " + f"an instance of {type(ax).__name__}" + ) + + memb = ax.asfreq(self.freq, how=self.convention) + + # NaT handling as in pandas._lib.lib.generate_bins_dt64() + nat_count = 0 + if memb.hasnans: + # error: Incompatible types in assignment (expression has type + # "bool_", variable has type "int") [assignment] + nat_count = np.sum(memb._isnan) # type: ignore[assignment] + memb = memb[~memb._isnan] + + if not len(memb): + # index contains no valid (non-NaT) values + bins = np.array([], dtype=np.int64) + binner = labels = PeriodIndex(data=[], freq=self.freq, name=ax.name) + if len(ax) > 0: + # index is all NaT + binner, bins, labels = _insert_nat_bin(binner, bins, labels, len(ax)) + return binner, bins, labels + + freq_mult = self.freq.n + + start = ax.min().asfreq(self.freq, how=self.convention) + end = ax.max().asfreq(self.freq, how="end") + bin_shift = 0 + + if isinstance(self.freq, Tick): + # GH 23882 & 31809: get adjusted bin edge labels with 'origin' + # and 'origin' support. This call only makes sense if the freq is a + # Tick since offset and origin are only used in those cases. + # Not doing this check could create an extra empty bin. + p_start, end = _get_period_range_edges( + start, + end, + self.freq, + closed=self.closed, + origin=self.origin, + offset=self.offset, + ) + + # Get offset for bin edge (not label edge) adjustment + start_offset = Period(start, self.freq) - Period(p_start, self.freq) + # error: Item "Period" of "Union[Period, Any]" has no attribute "n" + bin_shift = start_offset.n % freq_mult # type: ignore[union-attr] + start = p_start + + labels = binner = period_range( + start=start, end=end, freq=self.freq, name=ax.name + ) + + i8 = memb.asi8 + + # when upsampling to subperiods, we need to generate enough bins + expected_bins_count = len(binner) * freq_mult + i8_extend = expected_bins_count - (i8[-1] - i8[0]) + rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult) + rng += freq_mult + # adjust bin edge indexes to account for base + rng -= bin_shift + + # Wrap in PeriodArray for PeriodArray.searchsorted + prng = type(memb._data)(rng, dtype=memb.dtype) + bins = memb.searchsorted(prng, side="left") + + if nat_count > 0: + binner, bins, labels = _insert_nat_bin(binner, bins, labels, nat_count) + + return binner, bins, labels + + def _set_grouper( + self, obj: NDFrameT, sort: bool = False, *, gpr_index: Index | None = None + ) -> tuple[NDFrameT, Index, npt.NDArray[np.intp] | None]: + obj, ax, indexer = super()._set_grouper(obj, sort, gpr_index=gpr_index) + if isinstance(ax.dtype, ArrowDtype) and ax.dtype.kind in "Mm": + self._arrow_dtype = ax.dtype + ax = Index( + cast(ArrowExtensionArray, ax.array)._maybe_convert_datelike_array() + ) + return obj, ax, indexer + + +def _take_new_index( + obj: NDFrameT, indexer: npt.NDArray[np.intp], new_index: Index, axis: AxisInt = 0 +) -> NDFrameT: + if isinstance(obj, ABCSeries): + new_values = algos.take_nd(obj._values, indexer) + # error: Incompatible return value type (got "Series", expected "NDFrameT") + return obj._constructor( # type: ignore[return-value] + new_values, index=new_index, name=obj.name + ) + elif isinstance(obj, ABCDataFrame): + if axis == 1: + raise NotImplementedError("axis 1 is not supported") + new_mgr = obj._mgr.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1) + # error: Incompatible return value type (got "DataFrame", expected "NDFrameT") + return obj._constructor_from_mgr(new_mgr, axes=new_mgr.axes) # type: ignore[return-value] + else: + raise ValueError("'obj' should be either a Series or a DataFrame") + + +def _get_timestamp_range_edges( + first: Timestamp, + last: Timestamp, + freq: BaseOffset, + unit: str, + closed: Literal["right", "left"] = "left", + origin: TimeGrouperOrigin = "start_day", + offset: Timedelta | None = None, +) -> tuple[Timestamp, Timestamp]: + """ + Adjust the `first` Timestamp to the preceding Timestamp that resides on + the provided offset. Adjust the `last` Timestamp to the following + Timestamp that resides on the provided offset. Input Timestamps that + already reside on the offset will be adjusted depending on the type of + offset and the `closed` parameter. + + Parameters + ---------- + first : pd.Timestamp + The beginning Timestamp of the range to be adjusted. + last : pd.Timestamp + The ending Timestamp of the range to be adjusted. + freq : pd.DateOffset + The dateoffset to which the Timestamps will be adjusted. + closed : {'right', 'left'}, default "left" + Which side of bin interval is closed. + origin : {'epoch', 'start', 'start_day'} or Timestamp, default 'start_day' + The timestamp on which to adjust the grouping. The timezone of origin must + match the timezone of the index. + If a timestamp is not used, these values are also supported: + + - 'epoch': `origin` is 1970-01-01 + - 'start': `origin` is the first value of the timeseries + - 'start_day': `origin` is the first day at midnight of the timeseries + offset : pd.Timedelta, default is None + An offset timedelta added to the origin. + + Returns + ------- + A tuple of length 2, containing the adjusted pd.Timestamp objects. + """ + if isinstance(freq, Tick): + index_tz = first.tz + if isinstance(origin, Timestamp) and (origin.tz is None) != (index_tz is None): + raise ValueError("The origin must have the same timezone as the index.") + if origin == "epoch": + # set the epoch based on the timezone to have similar bins results when + # resampling on the same kind of indexes on different timezones + origin = Timestamp("1970-01-01", tz=index_tz) + + if isinstance(freq, Day): + # _adjust_dates_anchored assumes 'D' means 24h, but first/last + # might contain a DST transition (23h, 24h, or 25h). + # So "pretend" the dates are naive when adjusting the endpoints + first = first.tz_localize(None) + last = last.tz_localize(None) + if isinstance(origin, Timestamp): + origin = origin.tz_localize(None) + + first, last = _adjust_dates_anchored( + first, last, freq, closed=closed, origin=origin, offset=offset, unit=unit + ) + if isinstance(freq, Day): + first = first.tz_localize(index_tz) + last = last.tz_localize(index_tz) + else: + first = first.normalize() + last = last.normalize() + + if closed == "left": + first = Timestamp(freq.rollback(first)) + else: + first = Timestamp(first - freq) + + last = Timestamp(last + freq) + + return first, last + + +def _get_period_range_edges( + first: Period, + last: Period, + freq: BaseOffset, + closed: Literal["right", "left"] = "left", + origin: TimeGrouperOrigin = "start_day", + offset: Timedelta | None = None, +) -> tuple[Period, Period]: + """ + Adjust the provided `first` and `last` Periods to the respective Period of + the given offset that encompasses them. + + Parameters + ---------- + first : pd.Period + The beginning Period of the range to be adjusted. + last : pd.Period + The ending Period of the range to be adjusted. + freq : pd.DateOffset + The freq to which the Periods will be adjusted. + closed : {'right', 'left'}, default "left" + Which side of bin interval is closed. + origin : {'epoch', 'start', 'start_day'}, Timestamp, default 'start_day' + The timestamp on which to adjust the grouping. The timezone of origin must + match the timezone of the index. + + If a timestamp is not used, these values are also supported: + + - 'epoch': `origin` is 1970-01-01 + - 'start': `origin` is the first value of the timeseries + - 'start_day': `origin` is the first day at midnight of the timeseries + offset : pd.Timedelta, default is None + An offset timedelta added to the origin. + + Returns + ------- + A tuple of length 2, containing the adjusted pd.Period objects. + """ + if not all(isinstance(obj, Period) for obj in [first, last]): + raise TypeError("'first' and 'last' must be instances of type Period") + + # GH 23882 + first_ts = first.to_timestamp() + last_ts = last.to_timestamp() + adjust_first = not freq.is_on_offset(first_ts) + adjust_last = freq.is_on_offset(last_ts) + + first_ts, last_ts = _get_timestamp_range_edges( + first_ts, last_ts, freq, unit="ns", closed=closed, origin=origin, offset=offset + ) + + first = (first_ts + int(adjust_first) * freq).to_period(freq) + last = (last_ts - int(adjust_last) * freq).to_period(freq) + return first, last + + +def _insert_nat_bin( + binner: PeriodIndex, bins: np.ndarray, labels: PeriodIndex, nat_count: int +) -> tuple[PeriodIndex, np.ndarray, PeriodIndex]: + # NaT handling as in pandas._lib.lib.generate_bins_dt64() + # shift bins by the number of NaT + assert nat_count > 0 + bins += nat_count + bins = np.insert(bins, 0, nat_count) + + # Incompatible types in assignment (expression has type "Index", variable + # has type "PeriodIndex") + binner = binner.insert(0, NaT) # type: ignore[assignment] + # Incompatible types in assignment (expression has type "Index", variable + # has type "PeriodIndex") + labels = labels.insert(0, NaT) # type: ignore[assignment] + return binner, bins, labels + + +def _adjust_dates_anchored( + first: Timestamp, + last: Timestamp, + freq: Tick, + closed: Literal["right", "left"] = "right", + origin: TimeGrouperOrigin = "start_day", + offset: Timedelta | None = None, + unit: str = "ns", +) -> tuple[Timestamp, Timestamp]: + # First and last offsets should be calculated from the start day to fix an + # error cause by resampling across multiple days when a one day period is + # not a multiple of the frequency. See GH 8683 + # To handle frequencies that are not multiple or divisible by a day we let + # the possibility to define a fixed origin timestamp. See GH 31809 + first = first.as_unit(unit) + last = last.as_unit(unit) + if offset is not None: + offset = offset.as_unit(unit) + + freq_value = Timedelta(freq).as_unit(unit)._value + + origin_timestamp = 0 # origin == "epoch" + if origin == "start_day": + origin_timestamp = first.normalize()._value + elif origin == "start": + origin_timestamp = first._value + elif isinstance(origin, Timestamp): + origin_timestamp = origin.as_unit(unit)._value + elif origin in ["end", "end_day"]: + origin_last = last if origin == "end" else last.ceil("D") + sub_freq_times = (origin_last._value - first._value) // freq_value + if closed == "left": + sub_freq_times += 1 + first = origin_last - sub_freq_times * freq + origin_timestamp = first._value + origin_timestamp += offset._value if offset else 0 + + # GH 10117 & GH 19375. If first and last contain timezone information, + # Perform the calculation in UTC in order to avoid localizing on an + # Ambiguous or Nonexistent time. + first_tzinfo = first.tzinfo + last_tzinfo = last.tzinfo + if first_tzinfo is not None: + first = first.tz_convert("UTC") + if last_tzinfo is not None: + last = last.tz_convert("UTC") + + foffset = (first._value - origin_timestamp) % freq_value + loffset = (last._value - origin_timestamp) % freq_value + + if closed == "right": + if foffset > 0: + # roll back + fresult_int = first._value - foffset + else: + fresult_int = first._value - freq_value + + if loffset > 0: + # roll forward + lresult_int = last._value + (freq_value - loffset) + else: + # already the end of the road + lresult_int = last._value + else: # closed == 'left' + if foffset > 0: + fresult_int = first._value - foffset + else: + # start of the road + fresult_int = first._value + + if loffset > 0: + # roll forward + lresult_int = last._value + (freq_value - loffset) + else: + lresult_int = last._value + freq_value + fresult = Timestamp(fresult_int, unit=unit) + lresult = Timestamp(lresult_int, unit=unit) + if first_tzinfo is not None: + fresult = fresult.tz_localize("UTC").tz_convert(first_tzinfo) + if last_tzinfo is not None: + lresult = lresult.tz_localize("UTC").tz_convert(last_tzinfo) + return fresult, lresult + + +def asfreq( + obj: NDFrameT, + freq, + method=None, + how=None, + normalize: bool = False, + fill_value=None, +) -> NDFrameT: + """ + Utility frequency conversion method for Series/DataFrame. + + See :meth:`pandas.NDFrame.asfreq` for full documentation. + """ + if isinstance(obj.index, PeriodIndex): + if method is not None: + raise NotImplementedError("'method' argument is not supported") + + if how is None: + how = "E" + + if isinstance(freq, BaseOffset): + if hasattr(freq, "_period_dtype_code"): + freq = freq_to_period_freqstr(freq.n, freq.name) + else: + raise ValueError( + f"Invalid offset: '{freq.base}' for converting time series " + f"with PeriodIndex." + ) + + new_obj = obj.copy() + new_obj.index = obj.index.asfreq(freq, how=how) + + elif len(obj.index) == 0: + new_obj = obj.copy() + + new_obj.index = _asfreq_compat(obj.index, freq) + else: + unit = None + if isinstance(obj.index, DatetimeIndex): + # TODO: should we disallow non-DatetimeIndex? + unit = obj.index.unit + dti = date_range(obj.index.min(), obj.index.max(), freq=freq, unit=unit) + dti.name = obj.index.name + new_obj = obj.reindex(dti, method=method, fill_value=fill_value) + if normalize: + new_obj.index = new_obj.index.normalize() + + return new_obj + + +def _asfreq_compat(index: DatetimeIndex | PeriodIndex | TimedeltaIndex, freq): + """ + Helper to mimic asfreq on (empty) DatetimeIndex and TimedeltaIndex. + + Parameters + ---------- + index : PeriodIndex, DatetimeIndex, or TimedeltaIndex + freq : DateOffset + + Returns + ------- + same type as index + """ + if len(index) != 0: + # This should never be reached, always checked by the caller + raise ValueError( + "Can only set arbitrary freq for empty DatetimeIndex or TimedeltaIndex" + ) + new_index: Index + if isinstance(index, PeriodIndex): + new_index = index.asfreq(freq=freq) + elif isinstance(index, DatetimeIndex): + new_index = DatetimeIndex([], dtype=index.dtype, freq=freq, name=index.name) + elif isinstance(index, TimedeltaIndex): + new_index = TimedeltaIndex([], dtype=index.dtype, freq=freq, name=index.name) + else: # pragma: no cover + raise TypeError(type(index)) + return new_index + + +def maybe_warn_args_and_kwargs(cls, kernel: str, args, kwargs) -> None: + """ + Warn for deprecation of args and kwargs in resample functions. + + Parameters + ---------- + cls : type + Class to warn about. + kernel : str + Operation name. + args : tuple or None + args passed by user. Will be None if and only if kernel does not have args. + kwargs : dict or None + kwargs passed by user. Will be None if and only if kernel does not have kwargs. + """ + warn_args = args is not None and len(args) > 0 + warn_kwargs = kwargs is not None and len(kwargs) > 0 + if warn_args and warn_kwargs: + msg = "args and kwargs" + elif warn_args: + msg = "args" + elif warn_kwargs: + msg = "kwargs" + else: + return + warnings.warn( + f"Passing additional {msg} to {cls.__name__}.{kernel} has " + "no impact on the result and is deprecated. This will " + "raise a TypeError in a future version of pandas.", + category=FutureWarning, + stacklevel=find_stack_level(), + ) + + +def _apply( + grouped: GroupBy, how: Callable, *args, include_groups: bool, **kwargs +) -> DataFrame: + # GH#7155 - rewrite warning to appear as if it came from `.resample` + target_message = "DataFrameGroupBy.apply operated on the grouping columns" + new_message = _apply_groupings_depr.format("DataFrameGroupBy", "resample") + with rewrite_warning( + target_message=target_message, + target_category=DeprecationWarning, + new_message=new_message, + ): + result = grouped.apply(how, *args, include_groups=include_groups, **kwargs) + return result diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/sample.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/sample.py new file mode 100644 index 0000000000000000000000000000000000000000..eebbed3512c4eca42e401c40605838c6f69011df --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/sample.py @@ -0,0 +1,154 @@ +""" +Module containing utilities for NDFrame.sample() and .GroupBy.sample() +""" +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np + +from pandas._libs import lib + +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, +) + +if TYPE_CHECKING: + from pandas._typing import AxisInt + + from pandas.core.generic import NDFrame + + +def preprocess_weights(obj: NDFrame, weights, axis: AxisInt) -> np.ndarray: + """ + Process and validate the `weights` argument to `NDFrame.sample` and + `.GroupBy.sample`. + + Returns `weights` as an ndarray[np.float64], validated except for normalizing + weights (because that must be done groupwise in groupby sampling). + """ + # If a series, align with frame + if isinstance(weights, ABCSeries): + weights = weights.reindex(obj.axes[axis]) + + # Strings acceptable if a dataframe and axis = 0 + if isinstance(weights, str): + if isinstance(obj, ABCDataFrame): + if axis == 0: + try: + weights = obj[weights] + except KeyError as err: + raise KeyError( + "String passed to weights not a valid column" + ) from err + else: + raise ValueError( + "Strings can only be passed to " + "weights when sampling from rows on " + "a DataFrame" + ) + else: + raise ValueError( + "Strings cannot be passed as weights when sampling from a Series." + ) + + if isinstance(obj, ABCSeries): + func = obj._constructor + else: + func = obj._constructor_sliced + + weights = func(weights, dtype="float64")._values + + if len(weights) != obj.shape[axis]: + raise ValueError("Weights and axis to be sampled must be of same length") + + if lib.has_infs(weights): + raise ValueError("weight vector may not include `inf` values") + + if (weights < 0).any(): + raise ValueError("weight vector many not include negative values") + + missing = np.isnan(weights) + if missing.any(): + # Don't modify weights in place + weights = weights.copy() + weights[missing] = 0 + return weights + + +def process_sampling_size( + n: int | None, frac: float | None, replace: bool +) -> int | None: + """ + Process and validate the `n` and `frac` arguments to `NDFrame.sample` and + `.GroupBy.sample`. + + Returns None if `frac` should be used (variable sampling sizes), otherwise returns + the constant sampling size. + """ + # If no frac or n, default to n=1. + if n is None and frac is None: + n = 1 + elif n is not None and frac is not None: + raise ValueError("Please enter a value for `frac` OR `n`, not both") + elif n is not None: + if n < 0: + raise ValueError( + "A negative number of rows requested. Please provide `n` >= 0." + ) + if n % 1 != 0: + raise ValueError("Only integers accepted as `n` values") + else: + assert frac is not None # for mypy + if frac > 1 and not replace: + raise ValueError( + "Replace has to be set to `True` when " + "upsampling the population `frac` > 1." + ) + if frac < 0: + raise ValueError( + "A negative number of rows requested. Please provide `frac` >= 0." + ) + + return n + + +def sample( + obj_len: int, + size: int, + replace: bool, + weights: np.ndarray | None, + random_state: np.random.RandomState | np.random.Generator, +) -> np.ndarray: + """ + Randomly sample `size` indices in `np.arange(obj_len)` + + Parameters + ---------- + obj_len : int + The length of the indices being considered + size : int + The number of values to choose + replace : bool + Allow or disallow sampling of the same row more than once. + weights : np.ndarray[np.float64] or None + If None, equal probability weighting, otherwise weights according + to the vector normalized + random_state: np.random.RandomState or np.random.Generator + State used for the random sampling + + Returns + ------- + np.ndarray[np.intp] + """ + if weights is not None: + weight_sum = weights.sum() + if weight_sum != 0: + weights = weights / weight_sum + else: + raise ValueError("Invalid weights: weights sum to zero") + + return random_state.choice(obj_len, size=size, replace=replace, p=weights).astype( + np.intp, copy=False + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/series.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/series.py new file mode 100644 index 0000000000000000000000000000000000000000..6fd019656d207b12e01c9f5bc7260c8ff84bbdf5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/series.py @@ -0,0 +1,6631 @@ +""" +Data structure for 1-dimensional cross-sectional and time series data +""" +from __future__ import annotations + +from collections.abc import ( + Hashable, + Iterable, + Mapping, + Sequence, +) +import operator +import sys +from textwrap import dedent +from typing import ( + IO, + TYPE_CHECKING, + Any, + Callable, + Literal, + cast, + overload, +) +import warnings +import weakref + +import numpy as np + +from pandas._config import ( + using_copy_on_write, + warn_copy_on_write, +) +from pandas._config.config import _get_option + +from pandas._libs import ( + lib, + properties, + reshape, +) +from pandas._libs.lib import is_range_indexer +from pandas.compat import PYPY +from pandas.compat._constants import REF_COUNT +from pandas.compat._optional import import_optional_dependency +from pandas.compat.numpy import function as nv +from pandas.errors import ( + ChainedAssignmentError, + InvalidIndexError, + _chained_assignment_method_msg, + _chained_assignment_msg, + _chained_assignment_warning_method_msg, + _chained_assignment_warning_msg, + _check_cacher, +) +from pandas.util._decorators import ( + Appender, + Substitution, + deprecate_nonkeyword_arguments, + doc, +) +from pandas.util._exceptions import find_stack_level +from pandas.util._validators import ( + validate_ascending, + validate_bool_kwarg, + validate_percentile, +) + +from pandas.core.dtypes.astype import astype_is_view +from pandas.core.dtypes.cast import ( + LossySetitemError, + construct_1d_arraylike_from_scalar, + find_common_type, + infer_dtype_from, + maybe_box_native, + maybe_cast_pointwise_result, +) +from pandas.core.dtypes.common import ( + is_dict_like, + is_integer, + is_iterator, + is_list_like, + is_object_dtype, + is_scalar, + pandas_dtype, + validate_all_hashable, +) +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + ExtensionDtype, + SparseDtype, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, +) +from pandas.core.dtypes.inference import is_hashable +from pandas.core.dtypes.missing import ( + isna, + na_value_for_dtype, + notna, + remove_na_arraylike, +) + +from pandas.core import ( + algorithms, + base, + common as com, + missing, + nanops, + ops, + roperator, +) +from pandas.core.accessor import CachedAccessor +from pandas.core.apply import SeriesApply +from pandas.core.arrays import ExtensionArray +from pandas.core.arrays.arrow import ( + ListAccessor, + StructAccessor, +) +from pandas.core.arrays.categorical import CategoricalAccessor +from pandas.core.arrays.sparse import SparseAccessor +from pandas.core.arrays.string_ import StringDtype +from pandas.core.construction import ( + array as pd_array, + extract_array, + sanitize_array, +) +from pandas.core.generic import ( + NDFrame, + make_doc, +) +from pandas.core.indexers import ( + disallow_ndim_indexing, + unpack_1tuple, +) +from pandas.core.indexes.accessors import CombinedDatetimelikeProperties +from pandas.core.indexes.api import ( + DatetimeIndex, + Index, + MultiIndex, + PeriodIndex, + default_index, + ensure_index, +) +import pandas.core.indexes.base as ibase +from pandas.core.indexes.multi import maybe_droplevels +from pandas.core.indexing import ( + check_bool_indexer, + check_dict_or_set_indexers, +) +from pandas.core.internals import ( + SingleArrayManager, + SingleBlockManager, +) +from pandas.core.methods import selectn +from pandas.core.shared_docs import _shared_docs +from pandas.core.sorting import ( + ensure_key_mapped, + nargsort, +) +from pandas.core.strings.accessor import StringMethods +from pandas.core.tools.datetimes import to_datetime + +import pandas.io.formats.format as fmt +from pandas.io.formats.info import ( + INFO_DOCSTRING, + SeriesInfo, + series_sub_kwargs, +) +import pandas.plotting + +if TYPE_CHECKING: + from pandas._libs.internals import BlockValuesRefs + from pandas._typing import ( + AggFuncType, + AnyAll, + AnyArrayLike, + ArrayLike, + Axis, + AxisInt, + CorrelationMethod, + DropKeep, + Dtype, + DtypeObj, + FilePath, + Frequency, + IgnoreRaise, + IndexKeyFunc, + IndexLabel, + Level, + MutableMappingT, + NaPosition, + NumpySorter, + NumpyValueArrayLike, + QuantileInterpolation, + ReindexMethod, + Renamer, + Scalar, + Self, + SingleManager, + SortKind, + StorageOptions, + Suffixes, + ValueKeyFunc, + WriteBuffer, + npt, + ) + + from pandas.core.frame import DataFrame + from pandas.core.groupby.generic import SeriesGroupBy + +__all__ = ["Series"] + +_shared_doc_kwargs = { + "axes": "index", + "klass": "Series", + "axes_single_arg": "{0 or 'index'}", + "axis": """axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame.""", + "inplace": """inplace : bool, default False + If True, performs operation inplace and returns None.""", + "unique": "np.ndarray", + "duplicated": "Series", + "optional_by": "", + "optional_reindex": """ +index : array-like, optional + New labels for the index. Preferably an Index object to avoid + duplicating data. +axis : int or str, optional + Unused.""", +} + + +def _coerce_method(converter): + """ + Install the scalar coercion methods. + """ + + def wrapper(self): + if len(self) == 1: + warnings.warn( + f"Calling {converter.__name__} on a single element Series is " + "deprecated and will raise a TypeError in the future. " + f"Use {converter.__name__}(ser.iloc[0]) instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + return converter(self.iloc[0]) + raise TypeError(f"cannot convert the series to {converter}") + + wrapper.__name__ = f"__{converter.__name__}__" + return wrapper + + +# ---------------------------------------------------------------------- +# Series class + + +# error: Cannot override final attribute "ndim" (previously declared in base +# class "NDFrame") +# error: Cannot override final attribute "size" (previously declared in base +# class "NDFrame") +# definition in base class "NDFrame" +class Series(base.IndexOpsMixin, NDFrame): # type: ignore[misc] + """ + One-dimensional ndarray with axis labels (including time series). + + Labels need not be unique but must be a hashable type. The object + supports both integer- and label-based indexing and provides a host of + methods for performing operations involving the index. Statistical + methods from ndarray have been overridden to automatically exclude + missing data (currently represented as NaN). + + Operations between Series (+, -, /, \\*, \\*\\*) align values based on their + associated index values-- they need not be the same length. The result + index will be the sorted union of the two indexes. + + Parameters + ---------- + data : array-like, Iterable, dict, or scalar value + Contains data stored in Series. If data is a dict, argument order is + maintained. + index : array-like or Index (1d) + Values must be hashable and have the same length as `data`. + Non-unique index values are allowed. Will default to + RangeIndex (0, 1, 2, ..., n) if not provided. If data is dict-like + and index is None, then the keys in the data are used as the index. If the + index is not None, the resulting Series is reindexed with the index values. + dtype : str, numpy.dtype, or ExtensionDtype, optional + Data type for the output Series. If not specified, this will be + inferred from `data`. + See the :ref:`user guide ` for more usages. + name : Hashable, default None + The name to give to the Series. + copy : bool, default False + Copy input data. Only affects Series or 1d ndarray input. See examples. + + Notes + ----- + Please reference the :ref:`User Guide ` for more information. + + Examples + -------- + Constructing Series from a dictionary with an Index specified + + >>> d = {'a': 1, 'b': 2, 'c': 3} + >>> ser = pd.Series(data=d, index=['a', 'b', 'c']) + >>> ser + a 1 + b 2 + c 3 + dtype: int64 + + The keys of the dictionary match with the Index values, hence the Index + values have no effect. + + >>> d = {'a': 1, 'b': 2, 'c': 3} + >>> ser = pd.Series(data=d, index=['x', 'y', 'z']) + >>> ser + x NaN + y NaN + z NaN + dtype: float64 + + Note that the Index is first build with the keys from the dictionary. + After this the Series is reindexed with the given Index values, hence we + get all NaN as a result. + + Constructing Series from a list with `copy=False`. + + >>> r = [1, 2] + >>> ser = pd.Series(r, copy=False) + >>> ser.iloc[0] = 999 + >>> r + [1, 2] + >>> ser + 0 999 + 1 2 + dtype: int64 + + Due to input data type the Series has a `copy` of + the original data even though `copy=False`, so + the data is unchanged. + + Constructing Series from a 1d ndarray with `copy=False`. + + >>> r = np.array([1, 2]) + >>> ser = pd.Series(r, copy=False) + >>> ser.iloc[0] = 999 + >>> r + array([999, 2]) + >>> ser + 0 999 + 1 2 + dtype: int64 + + Due to input data type the Series has a `view` on + the original data, so + the data is changed as well. + """ + + _typ = "series" + _HANDLED_TYPES = (Index, ExtensionArray, np.ndarray) + + _name: Hashable + _metadata: list[str] = ["_name"] + _internal_names_set = {"index", "name"} | NDFrame._internal_names_set + _accessors = {"dt", "cat", "str", "sparse"} + _hidden_attrs = ( + base.IndexOpsMixin._hidden_attrs | NDFrame._hidden_attrs | frozenset([]) + ) + + # similar to __array_priority__, positions Series after DataFrame + # but before Index and ExtensionArray. Should NOT be overridden by subclasses. + __pandas_priority__ = 3000 + + # Override cache_readonly bc Series is mutable + # error: Incompatible types in assignment (expression has type "property", + # base class "IndexOpsMixin" defined the type as "Callable[[IndexOpsMixin], bool]") + hasnans = property( # type: ignore[assignment] + # error: "Callable[[IndexOpsMixin], bool]" has no attribute "fget" + base.IndexOpsMixin.hasnans.fget, # type: ignore[attr-defined] + doc=base.IndexOpsMixin.hasnans.__doc__, + ) + _mgr: SingleManager + + # ---------------------------------------------------------------------- + # Constructors + + def __init__( + self, + data=None, + index=None, + dtype: Dtype | None = None, + name=None, + copy: bool | None = None, + fastpath: bool | lib.NoDefault = lib.no_default, + ) -> None: + if fastpath is not lib.no_default: + warnings.warn( + "The 'fastpath' keyword in pd.Series is deprecated and will " + "be removed in a future version.", + DeprecationWarning, + stacklevel=find_stack_level(), + ) + else: + fastpath = False + + allow_mgr = False + if ( + isinstance(data, (SingleBlockManager, SingleArrayManager)) + and index is None + and dtype is None + and (copy is False or copy is None) + ): + if not allow_mgr: + # GH#52419 + warnings.warn( + f"Passing a {type(data).__name__} to {type(self).__name__} " + "is deprecated and will raise in a future version. " + "Use public APIs instead.", + DeprecationWarning, + stacklevel=2, + ) + if using_copy_on_write(): + data = data.copy(deep=False) + # GH#33357 called with just the SingleBlockManager + NDFrame.__init__(self, data) + if fastpath: + # e.g. from _box_col_values, skip validation of name + object.__setattr__(self, "_name", name) + else: + self.name = name + return + + is_pandas_object = isinstance(data, (Series, Index, ExtensionArray)) + data_dtype = getattr(data, "dtype", None) + original_dtype = dtype + + if isinstance(data, (ExtensionArray, np.ndarray)): + if copy is not False and using_copy_on_write(): + if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)): + data = data.copy() + if copy is None: + copy = False + + # we are called internally, so short-circuit + if fastpath: + # data is a ndarray, index is defined + if not isinstance(data, (SingleBlockManager, SingleArrayManager)): + manager = _get_option("mode.data_manager", silent=True) + if manager == "block": + data = SingleBlockManager.from_array(data, index) + elif manager == "array": + data = SingleArrayManager.from_array(data, index) + allow_mgr = True + elif using_copy_on_write() and not copy: + data = data.copy(deep=False) + + if not allow_mgr: + warnings.warn( + f"Passing a {type(data).__name__} to {type(self).__name__} " + "is deprecated and will raise in a future version. " + "Use public APIs instead.", + DeprecationWarning, + stacklevel=2, + ) + + if copy: + data = data.copy() + # skips validation of the name + object.__setattr__(self, "_name", name) + NDFrame.__init__(self, data) + return + + if isinstance(data, SingleBlockManager) and using_copy_on_write() and not copy: + data = data.copy(deep=False) + + if not allow_mgr: + warnings.warn( + f"Passing a {type(data).__name__} to {type(self).__name__} " + "is deprecated and will raise in a future version. " + "Use public APIs instead.", + DeprecationWarning, + stacklevel=2, + ) + + name = ibase.maybe_extract_name(name, data, type(self)) + + if index is not None: + index = ensure_index(index) + + if dtype is not None: + dtype = self._validate_dtype(dtype) + + if data is None: + index = index if index is not None else default_index(0) + if len(index) or dtype is not None: + data = na_value_for_dtype(pandas_dtype(dtype), compat=False) + else: + data = [] + + if isinstance(data, MultiIndex): + raise NotImplementedError( + "initializing a Series from a MultiIndex is not supported" + ) + + refs = None + if isinstance(data, Index): + if dtype is not None: + data = data.astype(dtype, copy=False) + + if using_copy_on_write(): + refs = data._references + data = data._values + else: + # GH#24096 we need to ensure the index remains immutable + data = data._values.copy() + copy = False + + elif isinstance(data, np.ndarray): + if len(data.dtype): + # GH#13296 we are dealing with a compound dtype, which + # should be treated as 2D + raise ValueError( + "Cannot construct a Series from an ndarray with " + "compound dtype. Use DataFrame instead." + ) + elif isinstance(data, Series): + if index is None: + index = data.index + data = data._mgr.copy(deep=False) + else: + data = data.reindex(index, copy=copy) + copy = False + data = data._mgr + elif isinstance(data, Mapping): + data, index = self._init_dict(data, index, dtype) + dtype = None + copy = False + elif isinstance(data, (SingleBlockManager, SingleArrayManager)): + if index is None: + index = data.index + elif not data.index.equals(index) or copy: + # GH#19275 SingleBlockManager input should only be called + # internally + raise AssertionError( + "Cannot pass both SingleBlockManager " + "`data` argument and a different " + "`index` argument. `copy` must be False." + ) + + if not allow_mgr: + warnings.warn( + f"Passing a {type(data).__name__} to {type(self).__name__} " + "is deprecated and will raise in a future version. " + "Use public APIs instead.", + DeprecationWarning, + stacklevel=2, + ) + allow_mgr = True + + elif isinstance(data, ExtensionArray): + pass + else: + data = com.maybe_iterable_to_list(data) + if is_list_like(data) and not len(data) and dtype is None: + # GH 29405: Pre-2.0, this defaulted to float. + dtype = np.dtype(object) + + if index is None: + if not is_list_like(data): + data = [data] + index = default_index(len(data)) + elif is_list_like(data): + com.require_length_match(data, index) + + # create/copy the manager + if isinstance(data, (SingleBlockManager, SingleArrayManager)): + if dtype is not None: + data = data.astype(dtype=dtype, errors="ignore", copy=copy) + elif copy: + data = data.copy() + else: + data = sanitize_array(data, index, dtype, copy) + + manager = _get_option("mode.data_manager", silent=True) + if manager == "block": + data = SingleBlockManager.from_array(data, index, refs=refs) + elif manager == "array": + data = SingleArrayManager.from_array(data, index) + + NDFrame.__init__(self, data) + self.name = name + self._set_axis(0, index) + + if original_dtype is None and is_pandas_object and data_dtype == np.object_: + if self.dtype != data_dtype: + warnings.warn( + "Dtype inference on a pandas object " + "(Series, Index, ExtensionArray) is deprecated. The Series " + "constructor will keep the original dtype in the future. " + "Call `infer_objects` on the result to get the old behavior.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + def _init_dict( + self, data: Mapping, index: Index | None = None, dtype: DtypeObj | None = None + ): + """ + Derive the "_mgr" and "index" attributes of a new Series from a + dictionary input. + + Parameters + ---------- + data : dict or dict-like + Data used to populate the new Series. + index : Index or None, default None + Index for the new Series: if None, use dict keys. + dtype : np.dtype, ExtensionDtype, or None, default None + The dtype for the new Series: if None, infer from data. + + Returns + ------- + _data : BlockManager for the new Series + index : index for the new Series + """ + keys: Index | tuple + + # Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')] + # raises KeyError), so we iterate the entire dict, and align + if data: + # GH:34717, issue was using zip to extract key and values from data. + # using generators in effects the performance. + # Below is the new way of extracting the keys and values + + keys = tuple(data.keys()) + values = list(data.values()) # Generating list of values- faster way + elif index is not None: + # fastpath for Series(data=None). Just use broadcasting a scalar + # instead of reindexing. + if len(index) or dtype is not None: + values = na_value_for_dtype(pandas_dtype(dtype), compat=False) + else: + values = [] + keys = index + else: + keys, values = default_index(0), [] + + # Input is now list-like, so rely on "standard" construction: + s = Series(values, index=keys, dtype=dtype) + + # Now we just make sure the order is respected, if any + if data and index is not None: + s = s.reindex(index, copy=False) + return s._mgr, s.index + + # ---------------------------------------------------------------------- + + @property + def _constructor(self) -> Callable[..., Series]: + return Series + + def _constructor_from_mgr(self, mgr, axes): + ser = Series._from_mgr(mgr, axes=axes) + ser._name = None # caller is responsible for setting real name + + if type(self) is Series: + # This would also work `if self._constructor is Series`, but + # this check is slightly faster, benefiting the most-common case. + return ser + + # We assume that the subclass __init__ knows how to handle a + # pd.Series object. + return self._constructor(ser) + + @property + def _constructor_expanddim(self) -> Callable[..., DataFrame]: + """ + Used when a manipulation result has one higher dimension as the + original, such as Series.to_frame() + """ + from pandas.core.frame import DataFrame + + return DataFrame + + def _constructor_expanddim_from_mgr(self, mgr, axes): + from pandas.core.frame import DataFrame + + df = DataFrame._from_mgr(mgr, axes=mgr.axes) + + if type(self) is Series: + # This would also work `if self._constructor_expanddim is DataFrame`, + # but this check is slightly faster, benefiting the most-common case. + return df + + # We assume that the subclass __init__ knows how to handle a + # pd.DataFrame object. + return self._constructor_expanddim(df) + + # types + @property + def _can_hold_na(self) -> bool: + return self._mgr._can_hold_na + + # ndarray compatibility + @property + def dtype(self) -> DtypeObj: + """ + Return the dtype object of the underlying data. + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s.dtype + dtype('int64') + """ + return self._mgr.dtype + + @property + def dtypes(self) -> DtypeObj: + """ + Return the dtype object of the underlying data. + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s.dtypes + dtype('int64') + """ + # DataFrame compatibility + return self.dtype + + @property + def name(self) -> Hashable: + """ + Return the name of the Series. + + The name of a Series becomes its index or column name if it is used + to form a DataFrame. It is also used whenever displaying the Series + using the interpreter. + + Returns + ------- + label (hashable object) + The name of the Series, also the column name if part of a DataFrame. + + See Also + -------- + Series.rename : Sets the Series name when given a scalar input. + Index.name : Corresponding Index property. + + Examples + -------- + The Series name can be set initially when calling the constructor. + + >>> s = pd.Series([1, 2, 3], dtype=np.int64, name='Numbers') + >>> s + 0 1 + 1 2 + 2 3 + Name: Numbers, dtype: int64 + >>> s.name = "Integers" + >>> s + 0 1 + 1 2 + 2 3 + Name: Integers, dtype: int64 + + The name of a Series within a DataFrame is its column name. + + >>> df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], + ... columns=["Odd Numbers", "Even Numbers"]) + >>> df + Odd Numbers Even Numbers + 0 1 2 + 1 3 4 + 2 5 6 + >>> df["Even Numbers"].name + 'Even Numbers' + """ + return self._name + + @name.setter + def name(self, value: Hashable) -> None: + validate_all_hashable(value, error_name=f"{type(self).__name__}.name") + object.__setattr__(self, "_name", value) + + @property + def values(self): + """ + Return Series as ndarray or ndarray-like depending on the dtype. + + .. warning:: + + We recommend using :attr:`Series.array` or + :meth:`Series.to_numpy`, depending on whether you need + a reference to the underlying data or a NumPy array. + + Returns + ------- + numpy.ndarray or ndarray-like + + See Also + -------- + Series.array : Reference to the underlying data. + Series.to_numpy : A NumPy array representing the underlying data. + + Examples + -------- + >>> pd.Series([1, 2, 3]).values + array([1, 2, 3]) + + >>> pd.Series(list('aabc')).values + array(['a', 'a', 'b', 'c'], dtype=object) + + >>> pd.Series(list('aabc')).astype('category').values + ['a', 'a', 'b', 'c'] + Categories (3, object): ['a', 'b', 'c'] + + Timezone aware datetime data is converted to UTC: + + >>> pd.Series(pd.date_range('20130101', periods=3, + ... tz='US/Eastern')).values + array(['2013-01-01T05:00:00.000000000', + '2013-01-02T05:00:00.000000000', + '2013-01-03T05:00:00.000000000'], dtype='datetime64[ns]') + """ + return self._mgr.external_values() + + @property + def _values(self): + """ + Return the internal repr of this data (defined by Block.interval_values). + This are the values as stored in the Block (ndarray or ExtensionArray + depending on the Block class), with datetime64[ns] and timedelta64[ns] + wrapped in ExtensionArrays to match Index._values behavior. + + Differs from the public ``.values`` for certain data types, because of + historical backwards compatibility of the public attribute (e.g. period + returns object ndarray and datetimetz a datetime64[ns] ndarray for + ``.values`` while it returns an ExtensionArray for ``._values`` in those + cases). + + Differs from ``.array`` in that this still returns the numpy array if + the Block is backed by a numpy array (except for datetime64 and + timedelta64 dtypes), while ``.array`` ensures to always return an + ExtensionArray. + + Overview: + + dtype | values | _values | array | + ----------- | ------------- | ------------- | --------------------- | + Numeric | ndarray | ndarray | NumpyExtensionArray | + Category | Categorical | Categorical | Categorical | + dt64[ns] | ndarray[M8ns] | DatetimeArray | DatetimeArray | + dt64[ns tz] | ndarray[M8ns] | DatetimeArray | DatetimeArray | + td64[ns] | ndarray[m8ns] | TimedeltaArray| TimedeltaArray | + Period | ndarray[obj] | PeriodArray | PeriodArray | + Nullable | EA | EA | EA | + + """ + return self._mgr.internal_values() + + @property + def _references(self) -> BlockValuesRefs | None: + if isinstance(self._mgr, SingleArrayManager): + return None + return self._mgr._block.refs + + # error: Decorated property not supported + @Appender(base.IndexOpsMixin.array.__doc__) # type: ignore[misc] + @property + def array(self) -> ExtensionArray: + return self._mgr.array_values() + + # ops + def ravel(self, order: str = "C") -> ArrayLike: + """ + Return the flattened underlying data as an ndarray or ExtensionArray. + + .. deprecated:: 2.2.0 + Series.ravel is deprecated. The underlying array is already 1D, so + ravel is not necessary. Use :meth:`to_numpy` for conversion to a numpy + array instead. + + Returns + ------- + numpy.ndarray or ExtensionArray + Flattened data of the Series. + + See Also + -------- + numpy.ndarray.ravel : Return a flattened array. + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s.ravel() # doctest: +SKIP + array([1, 2, 3]) + """ + warnings.warn( + "Series.ravel is deprecated. The underlying array is already 1D, so " + "ravel is not necessary. Use `to_numpy()` for conversion to a numpy " + "array instead.", + FutureWarning, + stacklevel=2, + ) + arr = self._values.ravel(order=order) + if isinstance(arr, np.ndarray) and using_copy_on_write(): + arr.flags.writeable = False + return arr + + def __len__(self) -> int: + """ + Return the length of the Series. + """ + return len(self._mgr) + + def view(self, dtype: Dtype | None = None) -> Series: + """ + Create a new view of the Series. + + .. deprecated:: 2.2.0 + ``Series.view`` is deprecated and will be removed in a future version. + Use :meth:`Series.astype` as an alternative to change the dtype. + + This function will return a new Series with a view of the same + underlying values in memory, optionally reinterpreted with a new data + type. The new data type must preserve the same size in bytes as to not + cause index misalignment. + + Parameters + ---------- + dtype : data type + Data type object or one of their string representations. + + Returns + ------- + Series + A new Series object as a view of the same data in memory. + + See Also + -------- + numpy.ndarray.view : Equivalent numpy function to create a new view of + the same data in memory. + + Notes + ----- + Series are instantiated with ``dtype=float64`` by default. While + ``numpy.ndarray.view()`` will return a view with the same data type as + the original array, ``Series.view()`` (without specified dtype) + will try using ``float64`` and may fail if the original data type size + in bytes is not the same. + + Examples + -------- + Use ``astype`` to change the dtype instead. + """ + warnings.warn( + "Series.view is deprecated and will be removed in a future version. " + "Use ``astype`` as an alternative to change the dtype.", + FutureWarning, + stacklevel=2, + ) + # self.array instead of self._values so we piggyback on NumpyExtensionArray + # implementation + res_values = self.array.view(dtype) + res_ser = self._constructor(res_values, index=self.index, copy=False) + if isinstance(res_ser._mgr, SingleBlockManager): + blk = res_ser._mgr._block + blk.refs = cast("BlockValuesRefs", self._references) + blk.refs.add_reference(blk) + return res_ser.__finalize__(self, method="view") + + # ---------------------------------------------------------------------- + # NDArray Compat + def __array__( + self, dtype: npt.DTypeLike | None = None, copy: bool | None = None + ) -> np.ndarray: + """ + Return the values as a NumPy array. + + Users should not call this directly. Rather, it is invoked by + :func:`numpy.array` and :func:`numpy.asarray`. + + Parameters + ---------- + dtype : str or numpy.dtype, optional + The dtype to use for the resulting NumPy array. By default, + the dtype is inferred from the data. + + copy : bool or None, optional + Unused. + + Returns + ------- + numpy.ndarray + The values in the series converted to a :class:`numpy.ndarray` + with the specified `dtype`. + + See Also + -------- + array : Create a new array from data. + Series.array : Zero-copy view to the array backing the Series. + Series.to_numpy : Series method for similar behavior. + + Examples + -------- + >>> ser = pd.Series([1, 2, 3]) + >>> np.asarray(ser) + array([1, 2, 3]) + + For timezone-aware data, the timezones may be retained with + ``dtype='object'`` + + >>> tzser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) + >>> np.asarray(tzser, dtype="object") + array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'), + Timestamp('2000-01-02 00:00:00+0100', tz='CET')], + dtype=object) + + Or the values may be localized to UTC and the tzinfo discarded with + ``dtype='datetime64[ns]'`` + + >>> np.asarray(tzser, dtype="datetime64[ns]") # doctest: +ELLIPSIS + array(['1999-12-31T23:00:00.000000000', ...], + dtype='datetime64[ns]') + """ + values = self._values + arr = np.asarray(values, dtype=dtype) + if using_copy_on_write() and astype_is_view(values.dtype, arr.dtype): + arr = arr.view() + arr.flags.writeable = False + return arr + + # ---------------------------------------------------------------------- + + def __column_consortium_standard__(self, *, api_version: str | None = None) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of pandas. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + dataframe_api_compat = import_optional_dependency("dataframe_api_compat") + return ( + dataframe_api_compat.pandas_standard.convert_to_standard_compliant_column( + self, api_version=api_version + ) + ) + + # ---------------------------------------------------------------------- + # Unary Methods + + # coercion + __float__ = _coerce_method(float) + __int__ = _coerce_method(int) + + # ---------------------------------------------------------------------- + + # indexers + @property + def axes(self) -> list[Index]: + """ + Return a list of the row axis labels. + """ + return [self.index] + + # ---------------------------------------------------------------------- + # Indexing Methods + + def _ixs(self, i: int, axis: AxisInt = 0) -> Any: + """ + Return the i-th value or values in the Series by location. + + Parameters + ---------- + i : int + + Returns + ------- + scalar + """ + return self._values[i] + + def _slice(self, slobj: slice, axis: AxisInt = 0) -> Series: + # axis kwarg is retained for compat with NDFrame method + # _slice is *always* positional + mgr = self._mgr.get_slice(slobj, axis=axis) + out = self._constructor_from_mgr(mgr, axes=mgr.axes) + out._name = self._name + return out.__finalize__(self) + + def __getitem__(self, key): + check_dict_or_set_indexers(key) + key = com.apply_if_callable(key, self) + + if key is Ellipsis: + if using_copy_on_write() or warn_copy_on_write(): + return self.copy(deep=False) + return self + + key_is_scalar = is_scalar(key) + if isinstance(key, (list, tuple)): + key = unpack_1tuple(key) + + if is_integer(key) and self.index._should_fallback_to_positional: + warnings.warn( + # GH#50617 + "Series.__getitem__ treating keys as positions is deprecated. " + "In a future version, integer keys will always be treated " + "as labels (consistent with DataFrame behavior). To access " + "a value by position, use `ser.iloc[pos]`", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self._values[key] + + elif key_is_scalar: + return self._get_value(key) + + # Convert generator to list before going through hashable part + # (We will iterate through the generator there to check for slices) + if is_iterator(key): + key = list(key) + + if is_hashable(key) and not isinstance(key, slice): + # Otherwise index.get_value will raise InvalidIndexError + try: + # For labels that don't resolve as scalars like tuples and frozensets + result = self._get_value(key) + + return result + + except (KeyError, TypeError, InvalidIndexError): + # InvalidIndexError for e.g. generator + # see test_series_getitem_corner_generator + if isinstance(key, tuple) and isinstance(self.index, MultiIndex): + # We still have the corner case where a tuple is a key + # in the first level of our MultiIndex + return self._get_values_tuple(key) + + if isinstance(key, slice): + # Do slice check before somewhat-costly is_bool_indexer + return self._getitem_slice(key) + + if com.is_bool_indexer(key): + key = check_bool_indexer(self.index, key) + key = np.asarray(key, dtype=bool) + return self._get_rows_with_mask(key) + + return self._get_with(key) + + def _get_with(self, key): + # other: fancy integer or otherwise + if isinstance(key, ABCDataFrame): + raise TypeError( + "Indexing a Series with DataFrame is not " + "supported, use the appropriate DataFrame column" + ) + elif isinstance(key, tuple): + return self._get_values_tuple(key) + + elif not is_list_like(key): + # e.g. scalars that aren't recognized by lib.is_scalar, GH#32684 + return self.loc[key] + + if not isinstance(key, (list, np.ndarray, ExtensionArray, Series, Index)): + key = list(key) + + key_type = lib.infer_dtype(key, skipna=False) + + # Note: The key_type == "boolean" case should be caught by the + # com.is_bool_indexer check in __getitem__ + if key_type == "integer": + # We need to decide whether to treat this as a positional indexer + # (i.e. self.iloc) or label-based (i.e. self.loc) + if not self.index._should_fallback_to_positional: + return self.loc[key] + else: + warnings.warn( + # GH#50617 + "Series.__getitem__ treating keys as positions is deprecated. " + "In a future version, integer keys will always be treated " + "as labels (consistent with DataFrame behavior). To access " + "a value by position, use `ser.iloc[pos]`", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self.iloc[key] + + # handle the dup indexing case GH#4246 + return self.loc[key] + + def _get_values_tuple(self, key: tuple): + # mpl hackaround + if com.any_none(*key): + # mpl compat if we look up e.g. ser[:, np.newaxis]; + # see tests.series.timeseries.test_mpl_compat_hack + # the asarray is needed to avoid returning a 2D DatetimeArray + result = np.asarray(self._values[key]) + disallow_ndim_indexing(result) + return result + + if not isinstance(self.index, MultiIndex): + raise KeyError("key of type tuple not found and not a MultiIndex") + + # If key is contained, would have returned by now + indexer, new_index = self.index.get_loc_level(key) + new_ser = self._constructor(self._values[indexer], index=new_index, copy=False) + if isinstance(indexer, slice): + new_ser._mgr.add_references(self._mgr) # type: ignore[arg-type] + return new_ser.__finalize__(self) + + def _get_rows_with_mask(self, indexer: npt.NDArray[np.bool_]) -> Series: + new_mgr = self._mgr.get_rows_with_mask(indexer) + return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self) + + def _get_value(self, label, takeable: bool = False): + """ + Quickly retrieve single value at passed index label. + + Parameters + ---------- + label : object + takeable : interpret the index as indexers, default False + + Returns + ------- + scalar value + """ + if takeable: + return self._values[label] + + # Similar to Index.get_value, but we do not fall back to positional + loc = self.index.get_loc(label) + + if is_integer(loc): + return self._values[loc] + + if isinstance(self.index, MultiIndex): + mi = self.index + new_values = self._values[loc] + if len(new_values) == 1 and mi.nlevels == 1: + # If more than one level left, we can not return a scalar + return new_values[0] + + new_index = mi[loc] + new_index = maybe_droplevels(new_index, label) + new_ser = self._constructor( + new_values, index=new_index, name=self.name, copy=False + ) + if isinstance(loc, slice): + new_ser._mgr.add_references(self._mgr) # type: ignore[arg-type] + return new_ser.__finalize__(self) + + else: + return self.iloc[loc] + + def __setitem__(self, key, value) -> None: + warn = True + if not PYPY and using_copy_on_write(): + if sys.getrefcount(self) <= 3: + warnings.warn( + _chained_assignment_msg, ChainedAssignmentError, stacklevel=2 + ) + elif not PYPY and not using_copy_on_write(): + ctr = sys.getrefcount(self) + ref_count = 3 + if not warn_copy_on_write() and _check_cacher(self): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count and ( + warn_copy_on_write() + or ( + not warn_copy_on_write() + and self._mgr.blocks[0].refs.has_reference() # type: ignore[union-attr] + ) + ): + warn = False + warnings.warn( + _chained_assignment_warning_msg, FutureWarning, stacklevel=2 + ) + + check_dict_or_set_indexers(key) + key = com.apply_if_callable(key, self) + cacher_needs_updating = self._check_is_chained_assignment_possible() + + if key is Ellipsis: + key = slice(None) + + if isinstance(key, slice): + indexer = self.index._convert_slice_indexer(key, kind="getitem") + return self._set_values(indexer, value, warn=warn) + + try: + self._set_with_engine(key, value, warn=warn) + except KeyError: + # We have a scalar (or for MultiIndex or object-dtype, scalar-like) + # key that is not present in self.index. + if is_integer(key): + if not self.index._should_fallback_to_positional: + # GH#33469 + self.loc[key] = value + else: + # positional setter + # can't use _mgr.setitem_inplace yet bc could have *both* + # KeyError and then ValueError, xref GH#45070 + warnings.warn( + # GH#50617 + "Series.__setitem__ treating keys as positions is deprecated. " + "In a future version, integer keys will always be treated " + "as labels (consistent with DataFrame behavior). To set " + "a value by position, use `ser.iloc[pos] = value`", + FutureWarning, + stacklevel=find_stack_level(), + ) + self._set_values(key, value) + else: + # GH#12862 adding a new key to the Series + self.loc[key] = value + + except (TypeError, ValueError, LossySetitemError): + # The key was OK, but we cannot set the value losslessly + indexer = self.index.get_loc(key) + self._set_values(indexer, value) + + except InvalidIndexError as err: + if isinstance(key, tuple) and not isinstance(self.index, MultiIndex): + # cases with MultiIndex don't get here bc they raise KeyError + # e.g. test_basic_getitem_setitem_corner + raise KeyError( + "key of type tuple not found and not a MultiIndex" + ) from err + + if com.is_bool_indexer(key): + key = check_bool_indexer(self.index, key) + key = np.asarray(key, dtype=bool) + + if ( + is_list_like(value) + and len(value) != len(self) + and not isinstance(value, Series) + and not is_object_dtype(self.dtype) + ): + # Series will be reindexed to have matching length inside + # _where call below + # GH#44265 + indexer = key.nonzero()[0] + self._set_values(indexer, value) + return + + # otherwise with listlike other we interpret series[mask] = other + # as series[mask] = other[mask] + try: + self._where(~key, value, inplace=True, warn=warn) + except InvalidIndexError: + # test_where_dups + self.iloc[key] = value + return + + else: + self._set_with(key, value, warn=warn) + + if cacher_needs_updating: + self._maybe_update_cacher(inplace=True) + + def _set_with_engine(self, key, value, warn: bool = True) -> None: + loc = self.index.get_loc(key) + + # this is equivalent to self._values[key] = value + self._mgr.setitem_inplace(loc, value, warn=warn) + + def _set_with(self, key, value, warn: bool = True) -> None: + # We got here via exception-handling off of InvalidIndexError, so + # key should always be listlike at this point. + assert not isinstance(key, tuple) + + if is_iterator(key): + # Without this, the call to infer_dtype will consume the generator + key = list(key) + + if not self.index._should_fallback_to_positional: + # Regardless of the key type, we're treating it as labels + self._set_labels(key, value, warn=warn) + + else: + # Note: key_type == "boolean" should not occur because that + # should be caught by the is_bool_indexer check in __setitem__ + key_type = lib.infer_dtype(key, skipna=False) + + if key_type == "integer": + warnings.warn( + # GH#50617 + "Series.__setitem__ treating keys as positions is deprecated. " + "In a future version, integer keys will always be treated " + "as labels (consistent with DataFrame behavior). To set " + "a value by position, use `ser.iloc[pos] = value`", + FutureWarning, + stacklevel=find_stack_level(), + ) + self._set_values(key, value, warn=warn) + else: + self._set_labels(key, value, warn=warn) + + def _set_labels(self, key, value, warn: bool = True) -> None: + key = com.asarray_tuplesafe(key) + indexer: np.ndarray = self.index.get_indexer(key) + mask = indexer == -1 + if mask.any(): + raise KeyError(f"{key[mask]} not in index") + self._set_values(indexer, value, warn=warn) + + def _set_values(self, key, value, warn: bool = True) -> None: + if isinstance(key, (Index, Series)): + key = key._values + + self._mgr = self._mgr.setitem(indexer=key, value=value, warn=warn) + self._maybe_update_cacher() + + def _set_value(self, label, value, takeable: bool = False) -> None: + """ + Quickly set single value at passed label. + + If label is not contained, a new object is created with the label + placed at the end of the result index. + + Parameters + ---------- + label : object + Partial indexing with MultiIndex not allowed. + value : object + Scalar value. + takeable : interpret the index as indexers, default False + """ + if not takeable: + try: + loc = self.index.get_loc(label) + except KeyError: + # set using a non-recursive method + self.loc[label] = value + return + else: + loc = label + + self._set_values(loc, value) + + # ---------------------------------------------------------------------- + # Lookup Caching + + @property + def _is_cached(self) -> bool: + """Return boolean indicating if self is cached or not.""" + return getattr(self, "_cacher", None) is not None + + def _get_cacher(self): + """return my cacher or None""" + cacher = getattr(self, "_cacher", None) + if cacher is not None: + cacher = cacher[1]() + return cacher + + def _reset_cacher(self) -> None: + """ + Reset the cacher. + """ + if hasattr(self, "_cacher"): + del self._cacher + + def _set_as_cached(self, item, cacher) -> None: + """ + Set the _cacher attribute on the calling object with a weakref to + cacher. + """ + if using_copy_on_write(): + return + self._cacher = (item, weakref.ref(cacher)) + + def _clear_item_cache(self) -> None: + # no-op for Series + pass + + def _check_is_chained_assignment_possible(self) -> bool: + """ + See NDFrame._check_is_chained_assignment_possible.__doc__ + """ + if self._is_view and self._is_cached: + ref = self._get_cacher() + if ref is not None and ref._is_mixed_type: + self._check_setitem_copy(t="referent", force=True) + return True + return super()._check_is_chained_assignment_possible() + + def _maybe_update_cacher( + self, clear: bool = False, verify_is_copy: bool = True, inplace: bool = False + ) -> None: + """ + See NDFrame._maybe_update_cacher.__doc__ + """ + # for CoW, we never want to update the parent DataFrame cache + # if the Series changed, but don't keep track of any cacher + if using_copy_on_write(): + return + cacher = getattr(self, "_cacher", None) + if cacher is not None: + ref: DataFrame = cacher[1]() + + # we are trying to reference a dead referent, hence + # a copy + if ref is None: + del self._cacher + elif len(self) == len(ref) and self.name in ref.columns: + # GH#42530 self.name must be in ref.columns + # to ensure column still in dataframe + # otherwise, either self or ref has swapped in new arrays + ref._maybe_cache_changed(cacher[0], self, inplace=inplace) + else: + # GH#33675 we have swapped in a new array, so parent + # reference to self is now invalid + ref._item_cache.pop(cacher[0], None) + + super()._maybe_update_cacher( + clear=clear, verify_is_copy=verify_is_copy, inplace=inplace + ) + + # ---------------------------------------------------------------------- + # Unsorted + + def repeat(self, repeats: int | Sequence[int], axis: None = None) -> Series: + """ + Repeat elements of a Series. + + Returns a new Series where each element of the current Series + is repeated consecutively a given number of times. + + Parameters + ---------- + repeats : int or array of ints + The number of repetitions for each element. This should be a + non-negative integer. Repeating 0 times will return an empty + Series. + axis : None + Unused. Parameter needed for compatibility with DataFrame. + + Returns + ------- + Series + Newly created Series with repeated elements. + + See Also + -------- + Index.repeat : Equivalent function for Index. + numpy.repeat : Similar method for :class:`numpy.ndarray`. + + Examples + -------- + >>> s = pd.Series(['a', 'b', 'c']) + >>> s + 0 a + 1 b + 2 c + dtype: object + >>> s.repeat(2) + 0 a + 0 a + 1 b + 1 b + 2 c + 2 c + dtype: object + >>> s.repeat([1, 2, 3]) + 0 a + 1 b + 1 b + 2 c + 2 c + 2 c + dtype: object + """ + nv.validate_repeat((), {"axis": axis}) + new_index = self.index.repeat(repeats) + new_values = self._values.repeat(repeats) + return self._constructor(new_values, index=new_index, copy=False).__finalize__( + self, method="repeat" + ) + + @overload + def reset_index( + self, + level: IndexLabel = ..., + *, + drop: Literal[False] = ..., + name: Level = ..., + inplace: Literal[False] = ..., + allow_duplicates: bool = ..., + ) -> DataFrame: + ... + + @overload + def reset_index( + self, + level: IndexLabel = ..., + *, + drop: Literal[True], + name: Level = ..., + inplace: Literal[False] = ..., + allow_duplicates: bool = ..., + ) -> Series: + ... + + @overload + def reset_index( + self, + level: IndexLabel = ..., + *, + drop: bool = ..., + name: Level = ..., + inplace: Literal[True], + allow_duplicates: bool = ..., + ) -> None: + ... + + def reset_index( + self, + level: IndexLabel | None = None, + *, + drop: bool = False, + name: Level = lib.no_default, + inplace: bool = False, + allow_duplicates: bool = False, + ) -> DataFrame | Series | None: + """ + Generate a new DataFrame or Series with the index reset. + + This is useful when the index needs to be treated as a column, or + when the index is meaningless and needs to be reset to the default + before another operation. + + Parameters + ---------- + level : int, str, tuple, or list, default optional + For a Series with a MultiIndex, only remove the specified levels + from the index. Removes all levels by default. + drop : bool, default False + Just reset the index, without inserting it as a column in + the new DataFrame. + name : object, optional + The name to use for the column containing the original Series + values. Uses ``self.name`` by default. This argument is ignored + when `drop` is True. + inplace : bool, default False + Modify the Series in place (do not create a new object). + allow_duplicates : bool, default False + Allow duplicate column labels to be created. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series or DataFrame or None + When `drop` is False (the default), a DataFrame is returned. + The newly created columns will come first in the DataFrame, + followed by the original Series values. + When `drop` is True, a `Series` is returned. + In either case, if ``inplace=True``, no value is returned. + + See Also + -------- + DataFrame.reset_index: Analogous function for DataFrame. + + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4], name='foo', + ... index=pd.Index(['a', 'b', 'c', 'd'], name='idx')) + + Generate a DataFrame with default index. + + >>> s.reset_index() + idx foo + 0 a 1 + 1 b 2 + 2 c 3 + 3 d 4 + + To specify the name of the new column use `name`. + + >>> s.reset_index(name='values') + idx values + 0 a 1 + 1 b 2 + 2 c 3 + 3 d 4 + + To generate a new Series with the default set `drop` to True. + + >>> s.reset_index(drop=True) + 0 1 + 1 2 + 2 3 + 3 4 + Name: foo, dtype: int64 + + The `level` parameter is interesting for Series with a multi-level + index. + + >>> arrays = [np.array(['bar', 'bar', 'baz', 'baz']), + ... np.array(['one', 'two', 'one', 'two'])] + >>> s2 = pd.Series( + ... range(4), name='foo', + ... index=pd.MultiIndex.from_arrays(arrays, + ... names=['a', 'b'])) + + To remove a specific level from the Index, use `level`. + + >>> s2.reset_index(level='a') + a foo + b + one bar 0 + two bar 1 + one baz 2 + two baz 3 + + If `level` is not set, all levels are removed from the Index. + + >>> s2.reset_index() + a b foo + 0 bar one 0 + 1 bar two 1 + 2 baz one 2 + 3 baz two 3 + """ + inplace = validate_bool_kwarg(inplace, "inplace") + if drop: + new_index = default_index(len(self)) + if level is not None: + level_list: Sequence[Hashable] + if not isinstance(level, (tuple, list)): + level_list = [level] + else: + level_list = level + level_list = [self.index._get_level_number(lev) for lev in level_list] + if len(level_list) < self.index.nlevels: + new_index = self.index.droplevel(level_list) + + if inplace: + self.index = new_index + elif using_copy_on_write(): + new_ser = self.copy(deep=False) + new_ser.index = new_index + return new_ser.__finalize__(self, method="reset_index") + else: + return self._constructor( + self._values.copy(), index=new_index, copy=False, dtype=self.dtype + ).__finalize__(self, method="reset_index") + elif inplace: + raise TypeError( + "Cannot reset_index inplace on a Series to create a DataFrame" + ) + else: + if name is lib.no_default: + # For backwards compatibility, keep columns as [0] instead of + # [None] when self.name is None + if self.name is None: + name = 0 + else: + name = self.name + + df = self.to_frame(name) + return df.reset_index( + level=level, drop=drop, allow_duplicates=allow_duplicates + ) + return None + + # ---------------------------------------------------------------------- + # Rendering Methods + + def __repr__(self) -> str: + """ + Return a string representation for a particular Series. + """ + # pylint: disable=invalid-repr-returned + repr_params = fmt.get_series_repr_params() + return self.to_string(**repr_params) + + @overload + def to_string( + self, + buf: None = ..., + na_rep: str = ..., + float_format: str | None = ..., + header: bool = ..., + index: bool = ..., + length: bool = ..., + dtype=..., + name=..., + max_rows: int | None = ..., + min_rows: int | None = ..., + ) -> str: + ... + + @overload + def to_string( + self, + buf: FilePath | WriteBuffer[str], + na_rep: str = ..., + float_format: str | None = ..., + header: bool = ..., + index: bool = ..., + length: bool = ..., + dtype=..., + name=..., + max_rows: int | None = ..., + min_rows: int | None = ..., + ) -> None: + ... + + def to_string( + self, + buf: FilePath | WriteBuffer[str] | None = None, + na_rep: str = "NaN", + float_format: str | None = None, + header: bool = True, + index: bool = True, + length: bool = False, + dtype: bool = False, + name: bool = False, + max_rows: int | None = None, + min_rows: int | None = None, + ) -> str | None: + """ + Render a string representation of the Series. + + Parameters + ---------- + buf : StringIO-like, optional + Buffer to write to. + na_rep : str, optional + String representation of NaN to use, default 'NaN'. + float_format : one-parameter function, optional + Formatter function to apply to columns' elements if they are + floats, default None. + header : bool, default True + Add the Series header (index name). + index : bool, optional + Add index (row) labels, default True. + length : bool, default False + Add the Series length. + dtype : bool, default False + Add the Series dtype. + name : bool, default False + Add the Series name if not None. + max_rows : int, optional + Maximum number of rows to show before truncating. If None, show + all. + min_rows : int, optional + The number of rows to display in a truncated repr (when number + of rows is above `max_rows`). + + Returns + ------- + str or None + String representation of Series if ``buf=None``, otherwise None. + + Examples + -------- + >>> ser = pd.Series([1, 2, 3]).to_string() + >>> ser + '0 1\\n1 2\\n2 3' + """ + formatter = fmt.SeriesFormatter( + self, + name=name, + length=length, + header=header, + index=index, + dtype=dtype, + na_rep=na_rep, + float_format=float_format, + min_rows=min_rows, + max_rows=max_rows, + ) + result = formatter.to_string() + + # catch contract violations + if not isinstance(result, str): + raise AssertionError( + "result must be of type str, type " + f"of result is {repr(type(result).__name__)}" + ) + + if buf is None: + return result + else: + if hasattr(buf, "write"): + buf.write(result) + else: + with open(buf, "w", encoding="utf-8") as f: + f.write(result) + return None + + @doc( + klass=_shared_doc_kwargs["klass"], + storage_options=_shared_docs["storage_options"], + examples=dedent( + """Examples + -------- + >>> s = pd.Series(["elk", "pig", "dog", "quetzal"], name="animal") + >>> print(s.to_markdown()) + | | animal | + |---:|:---------| + | 0 | elk | + | 1 | pig | + | 2 | dog | + | 3 | quetzal | + + Output markdown with a tabulate option. + + >>> print(s.to_markdown(tablefmt="grid")) + +----+----------+ + | | animal | + +====+==========+ + | 0 | elk | + +----+----------+ + | 1 | pig | + +----+----------+ + | 2 | dog | + +----+----------+ + | 3 | quetzal | + +----+----------+""" + ), + ) + def to_markdown( + self, + buf: IO[str] | None = None, + mode: str = "wt", + index: bool = True, + storage_options: StorageOptions | None = None, + **kwargs, + ) -> str | None: + """ + Print {klass} in Markdown-friendly format. + + Parameters + ---------- + buf : str, Path or StringIO-like, optional, default None + Buffer to write to. If None, the output is returned as a string. + mode : str, optional + Mode in which file is opened, "wt" by default. + index : bool, optional, default True + Add index (row) labels. + + {storage_options} + + **kwargs + These parameters will be passed to `tabulate \ + `_. + + Returns + ------- + str + {klass} in Markdown-friendly format. + + Notes + ----- + Requires the `tabulate `_ package. + + {examples} + """ + return self.to_frame().to_markdown( + buf, mode=mode, index=index, storage_options=storage_options, **kwargs + ) + + # ---------------------------------------------------------------------- + + def items(self) -> Iterable[tuple[Hashable, Any]]: + """ + Lazily iterate over (index, value) tuples. + + This method returns an iterable tuple (index, value). This is + convenient if you want to create a lazy iterator. + + Returns + ------- + iterable + Iterable of tuples containing the (index, value) pairs from a + Series. + + See Also + -------- + DataFrame.items : Iterate over (column name, Series) pairs. + DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) pairs. + + Examples + -------- + >>> s = pd.Series(['A', 'B', 'C']) + >>> for index, value in s.items(): + ... print(f"Index : {index}, Value : {value}") + Index : 0, Value : A + Index : 1, Value : B + Index : 2, Value : C + """ + return zip(iter(self.index), iter(self)) + + # ---------------------------------------------------------------------- + # Misc public methods + + def keys(self) -> Index: + """ + Return alias for index. + + Returns + ------- + Index + Index of the Series. + + Examples + -------- + >>> s = pd.Series([1, 2, 3], index=[0, 1, 2]) + >>> s.keys() + Index([0, 1, 2], dtype='int64') + """ + return self.index + + @overload + def to_dict( + self, *, into: type[MutableMappingT] | MutableMappingT + ) -> MutableMappingT: + ... + + @overload + def to_dict(self, *, into: type[dict] = ...) -> dict: + ... + + # error: Incompatible default for argument "into" (default has type "type[ + # dict[Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT") + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self"], name="to_dict" + ) + def to_dict( + self, + into: type[MutableMappingT] + | MutableMappingT = dict, # type: ignore[assignment] + ) -> MutableMappingT: + """ + Convert Series to {label -> value} dict or dict-like object. + + Parameters + ---------- + into : class, default dict + The collections.abc.MutableMapping subclass to use as the return + object. Can be the actual class or an empty instance of the mapping + type you want. If you want a collections.defaultdict, you must + pass it initialized. + + Returns + ------- + collections.abc.MutableMapping + Key-value representation of Series. + + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4]) + >>> s.to_dict() + {0: 1, 1: 2, 2: 3, 3: 4} + >>> from collections import OrderedDict, defaultdict + >>> s.to_dict(into=OrderedDict) + OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)]) + >>> dd = defaultdict(list) + >>> s.to_dict(into=dd) + defaultdict(, {0: 1, 1: 2, 2: 3, 3: 4}) + """ + # GH16122 + into_c = com.standardize_mapping(into) + + if is_object_dtype(self.dtype) or isinstance(self.dtype, ExtensionDtype): + return into_c((k, maybe_box_native(v)) for k, v in self.items()) + else: + # Not an object dtype => all types will be the same so let the default + # indexer return native python type + return into_c(self.items()) + + def to_frame(self, name: Hashable = lib.no_default) -> DataFrame: + """ + Convert Series to DataFrame. + + Parameters + ---------- + name : object, optional + The passed name should substitute for the series name (if it has + one). + + Returns + ------- + DataFrame + DataFrame representation of Series. + + Examples + -------- + >>> s = pd.Series(["a", "b", "c"], + ... name="vals") + >>> s.to_frame() + vals + 0 a + 1 b + 2 c + """ + columns: Index + if name is lib.no_default: + name = self.name + if name is None: + # default to [0], same as we would get with DataFrame(self) + columns = default_index(1) + else: + columns = Index([name]) + else: + columns = Index([name]) + + mgr = self._mgr.to_2d_mgr(columns) + df = self._constructor_expanddim_from_mgr(mgr, axes=mgr.axes) + return df.__finalize__(self, method="to_frame") + + def _set_name( + self, name, inplace: bool = False, deep: bool | None = None + ) -> Series: + """ + Set the Series name. + + Parameters + ---------- + name : str + inplace : bool + Whether to modify `self` directly or return a copy. + deep : bool|None, default None + Whether to do a deep copy, a shallow copy, or Copy on Write(None) + """ + inplace = validate_bool_kwarg(inplace, "inplace") + ser = self if inplace else self.copy(deep and not using_copy_on_write()) + ser.name = name + return ser + + @Appender( + dedent( + """ + Examples + -------- + >>> ser = pd.Series([390., 350., 30., 20.], + ... index=['Falcon', 'Falcon', 'Parrot', 'Parrot'], + ... name="Max Speed") + >>> ser + Falcon 390.0 + Falcon 350.0 + Parrot 30.0 + Parrot 20.0 + Name: Max Speed, dtype: float64 + >>> ser.groupby(["a", "b", "a", "b"]).mean() + a 210.0 + b 185.0 + Name: Max Speed, dtype: float64 + >>> ser.groupby(level=0).mean() + Falcon 370.0 + Parrot 25.0 + Name: Max Speed, dtype: float64 + >>> ser.groupby(ser > 100).mean() + Max Speed + False 25.0 + True 370.0 + Name: Max Speed, dtype: float64 + + **Grouping by Indexes** + + We can groupby different levels of a hierarchical index + using the `level` parameter: + + >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], + ... ['Captive', 'Wild', 'Captive', 'Wild']] + >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) + >>> ser = pd.Series([390., 350., 30., 20.], index=index, name="Max Speed") + >>> ser + Animal Type + Falcon Captive 390.0 + Wild 350.0 + Parrot Captive 30.0 + Wild 20.0 + Name: Max Speed, dtype: float64 + >>> ser.groupby(level=0).mean() + Animal + Falcon 370.0 + Parrot 25.0 + Name: Max Speed, dtype: float64 + >>> ser.groupby(level="Type").mean() + Type + Captive 210.0 + Wild 185.0 + Name: Max Speed, dtype: float64 + + We can also choose to include `NA` in group keys or not by defining + `dropna` parameter, the default setting is `True`. + + >>> ser = pd.Series([1, 2, 3, 3], index=["a", 'a', 'b', np.nan]) + >>> ser.groupby(level=0).sum() + a 3 + b 3 + dtype: int64 + + >>> ser.groupby(level=0, dropna=False).sum() + a 3 + b 3 + NaN 3 + dtype: int64 + + >>> arrays = ['Falcon', 'Falcon', 'Parrot', 'Parrot'] + >>> ser = pd.Series([390., 350., 30., 20.], index=arrays, name="Max Speed") + >>> ser.groupby(["a", "b", "a", np.nan]).mean() + a 210.0 + b 350.0 + Name: Max Speed, dtype: float64 + + >>> ser.groupby(["a", "b", "a", np.nan], dropna=False).mean() + a 210.0 + b 350.0 + NaN 20.0 + Name: Max Speed, dtype: float64 + """ + ) + ) + @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) + def groupby( + self, + by=None, + axis: Axis = 0, + level: IndexLabel | None = None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + observed: bool | lib.NoDefault = lib.no_default, + dropna: bool = True, + ) -> SeriesGroupBy: + from pandas.core.groupby.generic import SeriesGroupBy + + if level is None and by is None: + raise TypeError("You have to supply one of 'by' and 'level'") + if not as_index: + raise TypeError("as_index=False only valid with DataFrame") + axis = self._get_axis_number(axis) + + return SeriesGroupBy( + obj=self, + keys=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + observed=observed, + dropna=dropna, + ) + + # ---------------------------------------------------------------------- + # Statistics, overridden ndarray methods + + # TODO: integrate bottleneck + def count(self) -> int: + """ + Return number of non-NA/null observations in the Series. + + Returns + ------- + int + Number of non-null values in the Series. + + See Also + -------- + DataFrame.count : Count non-NA cells for each column or row. + + Examples + -------- + >>> s = pd.Series([0.0, 1.0, np.nan]) + >>> s.count() + 2 + """ + return notna(self._values).sum().astype("int64") + + def mode(self, dropna: bool = True) -> Series: + """ + Return the mode(s) of the Series. + + The mode is the value that appears most often. There can be multiple modes. + + Always returns Series even if only one value is returned. + + Parameters + ---------- + dropna : bool, default True + Don't consider counts of NaN/NaT. + + Returns + ------- + Series + Modes of the Series in sorted order. + + Examples + -------- + >>> s = pd.Series([2, 4, 2, 2, 4, None]) + >>> s.mode() + 0 2.0 + dtype: float64 + + More than one mode: + + >>> s = pd.Series([2, 4, 8, 2, 4, None]) + >>> s.mode() + 0 2.0 + 1 4.0 + dtype: float64 + + With and without considering null value: + + >>> s = pd.Series([2, 4, None, None, 4, None]) + >>> s.mode(dropna=False) + 0 NaN + dtype: float64 + >>> s = pd.Series([2, 4, None, None, 4, None]) + >>> s.mode() + 0 4.0 + dtype: float64 + """ + # TODO: Add option for bins like value_counts() + values = self._values + if isinstance(values, np.ndarray): + res_values = algorithms.mode(values, dropna=dropna) + else: + res_values = values._mode(dropna=dropna) + + # Ensure index is type stable (should always use int index) + return self._constructor( + res_values, + index=range(len(res_values)), + name=self.name, + copy=False, + dtype=self.dtype, + ).__finalize__(self, method="mode") + + def unique(self) -> ArrayLike: # pylint: disable=useless-parent-delegation + """ + Return unique values of Series object. + + Uniques are returned in order of appearance. Hash table-based unique, + therefore does NOT sort. + + Returns + ------- + ndarray or ExtensionArray + The unique values returned as a NumPy array. See Notes. + + See Also + -------- + Series.drop_duplicates : Return Series with duplicate values removed. + unique : Top-level unique method for any 1-d array-like object. + Index.unique : Return Index with unique values from an Index object. + + Notes + ----- + Returns the unique values as a NumPy array. In case of an + extension-array backed Series, a new + :class:`~api.extensions.ExtensionArray` of that type with just + the unique values is returned. This includes + + * Categorical + * Period + * Datetime with Timezone + * Datetime without Timezone + * Timedelta + * Interval + * Sparse + * IntegerNA + + See Examples section. + + Examples + -------- + >>> pd.Series([2, 1, 3, 3], name='A').unique() + array([2, 1, 3]) + + >>> pd.Series([pd.Timestamp('2016-01-01') for _ in range(3)]).unique() + + ['2016-01-01 00:00:00'] + Length: 1, dtype: datetime64[ns] + + >>> pd.Series([pd.Timestamp('2016-01-01', tz='US/Eastern') + ... for _ in range(3)]).unique() + + ['2016-01-01 00:00:00-05:00'] + Length: 1, dtype: datetime64[ns, US/Eastern] + + An Categorical will return categories in the order of + appearance and with the same dtype. + + >>> pd.Series(pd.Categorical(list('baabc'))).unique() + ['b', 'a', 'c'] + Categories (3, object): ['a', 'b', 'c'] + >>> pd.Series(pd.Categorical(list('baabc'), categories=list('abc'), + ... ordered=True)).unique() + ['b', 'a', 'c'] + Categories (3, object): ['a' < 'b' < 'c'] + """ + return super().unique() + + @overload + def drop_duplicates( + self, + *, + keep: DropKeep = ..., + inplace: Literal[False] = ..., + ignore_index: bool = ..., + ) -> Series: + ... + + @overload + def drop_duplicates( + self, *, keep: DropKeep = ..., inplace: Literal[True], ignore_index: bool = ... + ) -> None: + ... + + @overload + def drop_duplicates( + self, *, keep: DropKeep = ..., inplace: bool = ..., ignore_index: bool = ... + ) -> Series | None: + ... + + def drop_duplicates( + self, + *, + keep: DropKeep = "first", + inplace: bool = False, + ignore_index: bool = False, + ) -> Series | None: + """ + Return Series with duplicate values removed. + + Parameters + ---------- + keep : {'first', 'last', ``False``}, default 'first' + Method to handle dropping duplicates: + + - 'first' : Drop duplicates except for the first occurrence. + - 'last' : Drop duplicates except for the last occurrence. + - ``False`` : Drop all duplicates. + + inplace : bool, default ``False`` + If ``True``, performs operation inplace and returns None. + + ignore_index : bool, default ``False`` + If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 2.0.0 + + Returns + ------- + Series or None + Series with duplicates dropped or None if ``inplace=True``. + + See Also + -------- + Index.drop_duplicates : Equivalent method on Index. + DataFrame.drop_duplicates : Equivalent method on DataFrame. + Series.duplicated : Related method on Series, indicating duplicate + Series values. + Series.unique : Return unique values as an array. + + Examples + -------- + Generate a Series with duplicated entries. + + >>> s = pd.Series(['llama', 'cow', 'llama', 'beetle', 'llama', 'hippo'], + ... name='animal') + >>> s + 0 llama + 1 cow + 2 llama + 3 beetle + 4 llama + 5 hippo + Name: animal, dtype: object + + With the 'keep' parameter, the selection behaviour of duplicated values + can be changed. The value 'first' keeps the first occurrence for each + set of duplicated entries. The default value of keep is 'first'. + + >>> s.drop_duplicates() + 0 llama + 1 cow + 3 beetle + 5 hippo + Name: animal, dtype: object + + The value 'last' for parameter 'keep' keeps the last occurrence for + each set of duplicated entries. + + >>> s.drop_duplicates(keep='last') + 1 cow + 3 beetle + 4 llama + 5 hippo + Name: animal, dtype: object + + The value ``False`` for parameter 'keep' discards all sets of + duplicated entries. + + >>> s.drop_duplicates(keep=False) + 1 cow + 3 beetle + 5 hippo + Name: animal, dtype: object + """ + inplace = validate_bool_kwarg(inplace, "inplace") + result = super().drop_duplicates(keep=keep) + + if ignore_index: + result.index = default_index(len(result)) + + if inplace: + self._update_inplace(result) + return None + else: + return result + + def duplicated(self, keep: DropKeep = "first") -> Series: + """ + Indicate duplicate Series values. + + Duplicated values are indicated as ``True`` values in the resulting + Series. Either all duplicates, all except the first or all except the + last occurrence of duplicates can be indicated. + + Parameters + ---------- + keep : {'first', 'last', False}, default 'first' + Method to handle dropping duplicates: + + - 'first' : Mark duplicates as ``True`` except for the first + occurrence. + - 'last' : Mark duplicates as ``True`` except for the last + occurrence. + - ``False`` : Mark all duplicates as ``True``. + + Returns + ------- + Series[bool] + Series indicating whether each value has occurred in the + preceding values. + + See Also + -------- + Index.duplicated : Equivalent method on pandas.Index. + DataFrame.duplicated : Equivalent method on pandas.DataFrame. + Series.drop_duplicates : Remove duplicate values from Series. + + Examples + -------- + By default, for each set of duplicated values, the first occurrence is + set on False and all others on True: + + >>> animals = pd.Series(['llama', 'cow', 'llama', 'beetle', 'llama']) + >>> animals.duplicated() + 0 False + 1 False + 2 True + 3 False + 4 True + dtype: bool + + which is equivalent to + + >>> animals.duplicated(keep='first') + 0 False + 1 False + 2 True + 3 False + 4 True + dtype: bool + + By using 'last', the last occurrence of each set of duplicated values + is set on False and all others on True: + + >>> animals.duplicated(keep='last') + 0 True + 1 False + 2 True + 3 False + 4 False + dtype: bool + + By setting keep on ``False``, all duplicates are True: + + >>> animals.duplicated(keep=False) + 0 True + 1 False + 2 True + 3 False + 4 True + dtype: bool + """ + res = self._duplicated(keep=keep) + result = self._constructor(res, index=self.index, copy=False) + return result.__finalize__(self, method="duplicated") + + def idxmin(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Hashable: + """ + Return the row label of the minimum value. + + If multiple values equal the minimum, the first row label with that + value is returned. + + Parameters + ---------- + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + skipna : bool, default True + Exclude NA/null values. If the entire Series is NA, the result + will be NA. + *args, **kwargs + Additional arguments and keywords have no effect but might be + accepted for compatibility with NumPy. + + Returns + ------- + Index + Label of the minimum value. + + Raises + ------ + ValueError + If the Series is empty. + + See Also + -------- + numpy.argmin : Return indices of the minimum values + along the given axis. + DataFrame.idxmin : Return index of first occurrence of minimum + over requested axis. + Series.idxmax : Return index *label* of the first occurrence + of maximum of values. + + Notes + ----- + This method is the Series version of ``ndarray.argmin``. This method + returns the label of the minimum, while ``ndarray.argmin`` returns + the position. To get the position, use ``series.values.argmin()``. + + Examples + -------- + >>> s = pd.Series(data=[1, None, 4, 1], + ... index=['A', 'B', 'C', 'D']) + >>> s + A 1.0 + B NaN + C 4.0 + D 1.0 + dtype: float64 + + >>> s.idxmin() + 'A' + + If `skipna` is False and there is an NA value in the data, + the function returns ``nan``. + + >>> s.idxmin(skipna=False) + nan + """ + axis = self._get_axis_number(axis) + with warnings.catch_warnings(): + # TODO(3.0): this catching/filtering can be removed + # ignore warning produced by argmin since we will issue a different + # warning for idxmin + warnings.simplefilter("ignore") + i = self.argmin(axis, skipna, *args, **kwargs) + + if i == -1: + # GH#43587 give correct NA value for Index. + warnings.warn( + f"The behavior of {type(self).__name__}.idxmin with all-NA " + "values, or any-NA and skipna=False, is deprecated. In a future " + "version this will raise ValueError", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self.index._na_value + return self.index[i] + + def idxmax(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Hashable: + """ + Return the row label of the maximum value. + + If multiple values equal the maximum, the first row label with that + value is returned. + + Parameters + ---------- + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + skipna : bool, default True + Exclude NA/null values. If the entire Series is NA, the result + will be NA. + *args, **kwargs + Additional arguments and keywords have no effect but might be + accepted for compatibility with NumPy. + + Returns + ------- + Index + Label of the maximum value. + + Raises + ------ + ValueError + If the Series is empty. + + See Also + -------- + numpy.argmax : Return indices of the maximum values + along the given axis. + DataFrame.idxmax : Return index of first occurrence of maximum + over requested axis. + Series.idxmin : Return index *label* of the first occurrence + of minimum of values. + + Notes + ----- + This method is the Series version of ``ndarray.argmax``. This method + returns the label of the maximum, while ``ndarray.argmax`` returns + the position. To get the position, use ``series.values.argmax()``. + + Examples + -------- + >>> s = pd.Series(data=[1, None, 4, 3, 4], + ... index=['A', 'B', 'C', 'D', 'E']) + >>> s + A 1.0 + B NaN + C 4.0 + D 3.0 + E 4.0 + dtype: float64 + + >>> s.idxmax() + 'C' + + If `skipna` is False and there is an NA value in the data, + the function returns ``nan``. + + >>> s.idxmax(skipna=False) + nan + """ + axis = self._get_axis_number(axis) + with warnings.catch_warnings(): + # TODO(3.0): this catching/filtering can be removed + # ignore warning produced by argmax since we will issue a different + # warning for argmax + warnings.simplefilter("ignore") + i = self.argmax(axis, skipna, *args, **kwargs) + + if i == -1: + # GH#43587 give correct NA value for Index. + warnings.warn( + f"The behavior of {type(self).__name__}.idxmax with all-NA " + "values, or any-NA and skipna=False, is deprecated. In a future " + "version this will raise ValueError", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self.index._na_value + return self.index[i] + + def round(self, decimals: int = 0, *args, **kwargs) -> Series: + """ + Round each value in a Series to the given number of decimals. + + Parameters + ---------- + decimals : int, default 0 + Number of decimal places to round to. If decimals is negative, + it specifies the number of positions to the left of the decimal point. + *args, **kwargs + Additional arguments and keywords have no effect but might be + accepted for compatibility with NumPy. + + Returns + ------- + Series + Rounded values of the Series. + + See Also + -------- + numpy.around : Round values of an np.array. + DataFrame.round : Round values of a DataFrame. + + Examples + -------- + >>> s = pd.Series([0.1, 1.3, 2.7]) + >>> s.round() + 0 0.0 + 1 1.0 + 2 3.0 + dtype: float64 + """ + nv.validate_round(args, kwargs) + new_mgr = self._mgr.round(decimals=decimals, using_cow=using_copy_on_write()) + return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__( + self, method="round" + ) + + @overload + def quantile( + self, q: float = ..., interpolation: QuantileInterpolation = ... + ) -> float: + ... + + @overload + def quantile( + self, + q: Sequence[float] | AnyArrayLike, + interpolation: QuantileInterpolation = ..., + ) -> Series: + ... + + @overload + def quantile( + self, + q: float | Sequence[float] | AnyArrayLike = ..., + interpolation: QuantileInterpolation = ..., + ) -> float | Series: + ... + + def quantile( + self, + q: float | Sequence[float] | AnyArrayLike = 0.5, + interpolation: QuantileInterpolation = "linear", + ) -> float | Series: + """ + Return value at the given quantile. + + Parameters + ---------- + q : float or array-like, default 0.5 (50% quantile) + The quantile(s) to compute, which can lie in range: 0 <= q <= 1. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + This optional parameter specifies the interpolation method to use, + when the desired quantile lies between two data points `i` and `j`: + + * linear: `i + (j - i) * (x-i)/(j-i)`, where `(x-i)/(j-i)` is + the fractional part of the index surrounded by `i > j`. + * lower: `i`. + * higher: `j`. + * nearest: `i` or `j` whichever is nearest. + * midpoint: (`i` + `j`) / 2. + + Returns + ------- + float or Series + If ``q`` is an array, a Series will be returned where the + index is ``q`` and the values are the quantiles, otherwise + a float will be returned. + + See Also + -------- + core.window.Rolling.quantile : Calculate the rolling quantile. + numpy.percentile : Returns the q-th percentile(s) of the array elements. + + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4]) + >>> s.quantile(.5) + 2.5 + >>> s.quantile([.25, .5, .75]) + 0.25 1.75 + 0.50 2.50 + 0.75 3.25 + dtype: float64 + """ + validate_percentile(q) + + # We dispatch to DataFrame so that core.internals only has to worry + # about 2D cases. + df = self.to_frame() + + result = df.quantile(q=q, interpolation=interpolation, numeric_only=False) + if result.ndim == 2: + result = result.iloc[:, 0] + + if is_list_like(q): + result.name = self.name + idx = Index(q, dtype=np.float64) + return self._constructor(result, index=idx, name=self.name) + else: + # scalar + return result.iloc[0] + + def corr( + self, + other: Series, + method: CorrelationMethod = "pearson", + min_periods: int | None = None, + ) -> float: + """ + Compute correlation with `other` Series, excluding missing values. + + The two `Series` objects are not required to be the same length and will be + aligned internally before the correlation function is applied. + + Parameters + ---------- + other : Series + Series with which to compute the correlation. + method : {'pearson', 'kendall', 'spearman'} or callable + Method used to compute correlation: + + - pearson : Standard correlation coefficient + - kendall : Kendall Tau correlation coefficient + - spearman : Spearman rank correlation + - callable: Callable with input two 1d ndarrays and returning a float. + + .. warning:: + Note that the returned matrix from corr will have 1 along the + diagonals and will be symmetric regardless of the callable's + behavior. + min_periods : int, optional + Minimum number of observations needed to have a valid result. + + Returns + ------- + float + Correlation with other. + + See Also + -------- + DataFrame.corr : Compute pairwise correlation between columns. + DataFrame.corrwith : Compute pairwise correlation with another + DataFrame or Series. + + Notes + ----- + Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations. + + * `Pearson correlation coefficient `_ + * `Kendall rank correlation coefficient `_ + * `Spearman's rank correlation coefficient `_ + + Automatic data alignment: as with all pandas operations, automatic data alignment is performed for this method. + ``corr()`` automatically considers values with matching indices. + + Examples + -------- + >>> def histogram_intersection(a, b): + ... v = np.minimum(a, b).sum().round(decimals=1) + ... return v + >>> s1 = pd.Series([.2, .0, .6, .2]) + >>> s2 = pd.Series([.3, .6, .0, .1]) + >>> s1.corr(s2, method=histogram_intersection) + 0.3 + + Pandas auto-aligns the values with matching indices + + >>> s1 = pd.Series([1, 2, 3], index=[0, 1, 2]) + >>> s2 = pd.Series([1, 2, 3], index=[2, 1, 0]) + >>> s1.corr(s2) + -1.0 + """ # noqa: E501 + this, other = self.align(other, join="inner", copy=False) + if len(this) == 0: + return np.nan + + this_values = this.to_numpy(dtype=float, na_value=np.nan, copy=False) + other_values = other.to_numpy(dtype=float, na_value=np.nan, copy=False) + + if method in ["pearson", "spearman", "kendall"] or callable(method): + return nanops.nancorr( + this_values, other_values, method=method, min_periods=min_periods + ) + + raise ValueError( + "method must be either 'pearson', " + "'spearman', 'kendall', or a callable, " + f"'{method}' was supplied" + ) + + def cov( + self, + other: Series, + min_periods: int | None = None, + ddof: int | None = 1, + ) -> float: + """ + Compute covariance with Series, excluding missing values. + + The two `Series` objects are not required to be the same length and + will be aligned internally before the covariance is calculated. + + Parameters + ---------- + other : Series + Series with which to compute the covariance. + min_periods : int, optional + Minimum number of observations needed to have a valid result. + ddof : int, default 1 + Delta degrees of freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + + Returns + ------- + float + Covariance between Series and other normalized by N-1 + (unbiased estimator). + + See Also + -------- + DataFrame.cov : Compute pairwise covariance of columns. + + Examples + -------- + >>> s1 = pd.Series([0.90010907, 0.13484424, 0.62036035]) + >>> s2 = pd.Series([0.12528585, 0.26962463, 0.51111198]) + >>> s1.cov(s2) + -0.01685762652715874 + """ + this, other = self.align(other, join="inner", copy=False) + if len(this) == 0: + return np.nan + this_values = this.to_numpy(dtype=float, na_value=np.nan, copy=False) + other_values = other.to_numpy(dtype=float, na_value=np.nan, copy=False) + return nanops.nancov( + this_values, other_values, min_periods=min_periods, ddof=ddof + ) + + @doc( + klass="Series", + extra_params="", + other_klass="DataFrame", + examples=dedent( + """ + Difference with previous row + + >>> s = pd.Series([1, 1, 2, 3, 5, 8]) + >>> s.diff() + 0 NaN + 1 0.0 + 2 1.0 + 3 1.0 + 4 2.0 + 5 3.0 + dtype: float64 + + Difference with 3rd previous row + + >>> s.diff(periods=3) + 0 NaN + 1 NaN + 2 NaN + 3 2.0 + 4 4.0 + 5 6.0 + dtype: float64 + + Difference with following row + + >>> s.diff(periods=-1) + 0 0.0 + 1 -1.0 + 2 -1.0 + 3 -2.0 + 4 -3.0 + 5 NaN + dtype: float64 + + Overflow in input dtype + + >>> s = pd.Series([1, 0], dtype=np.uint8) + >>> s.diff() + 0 NaN + 1 255.0 + dtype: float64""" + ), + ) + def diff(self, periods: int = 1) -> Series: + """ + First discrete difference of element. + + Calculates the difference of a {klass} element compared with another + element in the {klass} (default is element in previous row). + + Parameters + ---------- + periods : int, default 1 + Periods to shift for calculating difference, accepts negative + values. + {extra_params} + Returns + ------- + {klass} + First differences of the Series. + + See Also + -------- + {klass}.pct_change: Percent change over given number of periods. + {klass}.shift: Shift index by desired number of periods with an + optional time freq. + {other_klass}.diff: First discrete difference of object. + + Notes + ----- + For boolean dtypes, this uses :meth:`operator.xor` rather than + :meth:`operator.sub`. + The result is calculated according to current dtype in {klass}, + however dtype of the result is always float64. + + Examples + -------- + {examples} + """ + result = algorithms.diff(self._values, periods) + return self._constructor(result, index=self.index, copy=False).__finalize__( + self, method="diff" + ) + + def autocorr(self, lag: int = 1) -> float: + """ + Compute the lag-N autocorrelation. + + This method computes the Pearson correlation between + the Series and its shifted self. + + Parameters + ---------- + lag : int, default 1 + Number of lags to apply before performing autocorrelation. + + Returns + ------- + float + The Pearson correlation between self and self.shift(lag). + + See Also + -------- + Series.corr : Compute the correlation between two Series. + Series.shift : Shift index by desired number of periods. + DataFrame.corr : Compute pairwise correlation of columns. + DataFrame.corrwith : Compute pairwise correlation between rows or + columns of two DataFrame objects. + + Notes + ----- + If the Pearson correlation is not well defined return 'NaN'. + + Examples + -------- + >>> s = pd.Series([0.25, 0.5, 0.2, -0.05]) + >>> s.autocorr() # doctest: +ELLIPSIS + 0.10355... + >>> s.autocorr(lag=2) # doctest: +ELLIPSIS + -0.99999... + + If the Pearson correlation is not well defined, then 'NaN' is returned. + + >>> s = pd.Series([1, 0, 0, 0]) + >>> s.autocorr() + nan + """ + return self.corr(cast(Series, self.shift(lag))) + + def dot(self, other: AnyArrayLike) -> Series | np.ndarray: + """ + Compute the dot product between the Series and the columns of other. + + This method computes the dot product between the Series and another + one, or the Series and each columns of a DataFrame, or the Series and + each columns of an array. + + It can also be called using `self @ other`. + + Parameters + ---------- + other : Series, DataFrame or array-like + The other object to compute the dot product with its columns. + + Returns + ------- + scalar, Series or numpy.ndarray + Return the dot product of the Series and other if other is a + Series, the Series of the dot product of Series and each rows of + other if other is a DataFrame or a numpy.ndarray between the Series + and each columns of the numpy array. + + See Also + -------- + DataFrame.dot: Compute the matrix product with the DataFrame. + Series.mul: Multiplication of series and other, element-wise. + + Notes + ----- + The Series and other has to share the same index if other is a Series + or a DataFrame. + + Examples + -------- + >>> s = pd.Series([0, 1, 2, 3]) + >>> other = pd.Series([-1, 2, -3, 4]) + >>> s.dot(other) + 8 + >>> s @ other + 8 + >>> df = pd.DataFrame([[0, 1], [-2, 3], [4, -5], [6, 7]]) + >>> s.dot(df) + 0 24 + 1 14 + dtype: int64 + >>> arr = np.array([[0, 1], [-2, 3], [4, -5], [6, 7]]) + >>> s.dot(arr) + array([24, 14]) + """ + if isinstance(other, (Series, ABCDataFrame)): + common = self.index.union(other.index) + if len(common) > len(self.index) or len(common) > len(other.index): + raise ValueError("matrices are not aligned") + + left = self.reindex(index=common, copy=False) + right = other.reindex(index=common, copy=False) + lvals = left.values + rvals = right.values + else: + lvals = self.values + rvals = np.asarray(other) + if lvals.shape[0] != rvals.shape[0]: + raise Exception( + f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}" + ) + + if isinstance(other, ABCDataFrame): + return self._constructor( + np.dot(lvals, rvals), index=other.columns, copy=False + ).__finalize__(self, method="dot") + elif isinstance(other, Series): + return np.dot(lvals, rvals) + elif isinstance(rvals, np.ndarray): + return np.dot(lvals, rvals) + else: # pragma: no cover + raise TypeError(f"unsupported type: {type(other)}") + + def __matmul__(self, other): + """ + Matrix multiplication using binary `@` operator. + """ + return self.dot(other) + + def __rmatmul__(self, other): + """ + Matrix multiplication using binary `@` operator. + """ + return self.dot(np.transpose(other)) + + @doc(base.IndexOpsMixin.searchsorted, klass="Series") + # Signature of "searchsorted" incompatible with supertype "IndexOpsMixin" + def searchsorted( # type: ignore[override] + self, + value: NumpyValueArrayLike | ExtensionArray, + side: Literal["left", "right"] = "left", + sorter: NumpySorter | None = None, + ) -> npt.NDArray[np.intp] | np.intp: + return base.IndexOpsMixin.searchsorted(self, value, side=side, sorter=sorter) + + # ------------------------------------------------------------------- + # Combination + + def _append( + self, to_append, ignore_index: bool = False, verify_integrity: bool = False + ): + from pandas.core.reshape.concat import concat + + if isinstance(to_append, (list, tuple)): + to_concat = [self] + to_concat.extend(to_append) + else: + to_concat = [self, to_append] + if any(isinstance(x, (ABCDataFrame,)) for x in to_concat[1:]): + msg = "to_append should be a Series or list/tuple of Series, got DataFrame" + raise TypeError(msg) + return concat( + to_concat, ignore_index=ignore_index, verify_integrity=verify_integrity + ) + + @doc( + _shared_docs["compare"], + dedent( + """ + Returns + ------- + Series or DataFrame + If axis is 0 or 'index' the result will be a Series. + The resulting index will be a MultiIndex with 'self' and 'other' + stacked alternately at the inner level. + + If axis is 1 or 'columns' the result will be a DataFrame. + It will have two columns namely 'self' and 'other'. + + See Also + -------- + DataFrame.compare : Compare with another DataFrame and show differences. + + Notes + ----- + Matching NaNs will not appear as a difference. + + Examples + -------- + >>> s1 = pd.Series(["a", "b", "c", "d", "e"]) + >>> s2 = pd.Series(["a", "a", "c", "b", "e"]) + + Align the differences on columns + + >>> s1.compare(s2) + self other + 1 b a + 3 d b + + Stack the differences on indices + + >>> s1.compare(s2, align_axis=0) + 1 self b + other a + 3 self d + other b + dtype: object + + Keep all original rows + + >>> s1.compare(s2, keep_shape=True) + self other + 0 NaN NaN + 1 b a + 2 NaN NaN + 3 d b + 4 NaN NaN + + Keep all original rows and also all original values + + >>> s1.compare(s2, keep_shape=True, keep_equal=True) + self other + 0 a a + 1 b a + 2 c c + 3 d b + 4 e e + """ + ), + klass=_shared_doc_kwargs["klass"], + ) + def compare( + self, + other: Series, + align_axis: Axis = 1, + keep_shape: bool = False, + keep_equal: bool = False, + result_names: Suffixes = ("self", "other"), + ) -> DataFrame | Series: + return super().compare( + other=other, + align_axis=align_axis, + keep_shape=keep_shape, + keep_equal=keep_equal, + result_names=result_names, + ) + + def combine( + self, + other: Series | Hashable, + func: Callable[[Hashable, Hashable], Hashable], + fill_value: Hashable | None = None, + ) -> Series: + """ + Combine the Series with a Series or scalar according to `func`. + + Combine the Series and `other` using `func` to perform elementwise + selection for combined Series. + `fill_value` is assumed when value is missing at some index + from one of the two objects being combined. + + Parameters + ---------- + other : Series or scalar + The value(s) to be combined with the `Series`. + func : function + Function that takes two scalars as inputs and returns an element. + fill_value : scalar, optional + The value to assume when an index is missing from + one Series or the other. The default specifies to use the + appropriate NaN value for the underlying dtype of the Series. + + Returns + ------- + Series + The result of combining the Series with the other object. + + See Also + -------- + Series.combine_first : Combine Series values, choosing the calling + Series' values first. + + Examples + -------- + Consider 2 Datasets ``s1`` and ``s2`` containing + highest clocked speeds of different birds. + + >>> s1 = pd.Series({'falcon': 330.0, 'eagle': 160.0}) + >>> s1 + falcon 330.0 + eagle 160.0 + dtype: float64 + >>> s2 = pd.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0}) + >>> s2 + falcon 345.0 + eagle 200.0 + duck 30.0 + dtype: float64 + + Now, to combine the two datasets and view the highest speeds + of the birds across the two datasets + + >>> s1.combine(s2, max) + duck NaN + eagle 200.0 + falcon 345.0 + dtype: float64 + + In the previous example, the resulting value for duck is missing, + because the maximum of a NaN and a float is a NaN. + So, in the example, we set ``fill_value=0``, + so the maximum value returned will be the value from some dataset. + + >>> s1.combine(s2, max, fill_value=0) + duck 30.0 + eagle 200.0 + falcon 345.0 + dtype: float64 + """ + if fill_value is None: + fill_value = na_value_for_dtype(self.dtype, compat=False) + + if isinstance(other, Series): + # If other is a Series, result is based on union of Series, + # so do this element by element + new_index = self.index.union(other.index) + new_name = ops.get_op_result_name(self, other) + new_values = np.empty(len(new_index), dtype=object) + with np.errstate(all="ignore"): + for i, idx in enumerate(new_index): + lv = self.get(idx, fill_value) + rv = other.get(idx, fill_value) + new_values[i] = func(lv, rv) + else: + # Assume that other is a scalar, so apply the function for + # each element in the Series + new_index = self.index + new_values = np.empty(len(new_index), dtype=object) + with np.errstate(all="ignore"): + new_values[:] = [func(lv, other) for lv in self._values] + new_name = self.name + + # try_float=False is to match agg_series + npvalues = lib.maybe_convert_objects(new_values, try_float=False) + # same_dtype here is a kludge to avoid casting e.g. [True, False] to + # ["True", "False"] + same_dtype = isinstance(self.dtype, (StringDtype, CategoricalDtype)) + res_values = maybe_cast_pointwise_result( + npvalues, self.dtype, same_dtype=same_dtype + ) + return self._constructor(res_values, index=new_index, name=new_name, copy=False) + + def combine_first(self, other) -> Series: + """ + Update null elements with value in the same location in 'other'. + + Combine two Series objects by filling null values in one Series with + non-null values from the other Series. Result index will be the union + of the two indexes. + + Parameters + ---------- + other : Series + The value(s) to be used for filling null values. + + Returns + ------- + Series + The result of combining the provided Series with the other object. + + See Also + -------- + Series.combine : Perform element-wise operation on two Series + using a given function. + + Examples + -------- + >>> s1 = pd.Series([1, np.nan]) + >>> s2 = pd.Series([3, 4, 5]) + >>> s1.combine_first(s2) + 0 1.0 + 1 4.0 + 2 5.0 + dtype: float64 + + Null values still persist if the location of that null value + does not exist in `other` + + >>> s1 = pd.Series({'falcon': np.nan, 'eagle': 160.0}) + >>> s2 = pd.Series({'eagle': 200.0, 'duck': 30.0}) + >>> s1.combine_first(s2) + duck 30.0 + eagle 160.0 + falcon NaN + dtype: float64 + """ + from pandas.core.reshape.concat import concat + + if self.dtype == other.dtype: + if self.index.equals(other.index): + return self.mask(self.isna(), other) + elif self._can_hold_na and not isinstance(self.dtype, SparseDtype): + this, other = self.align(other, join="outer") + return this.mask(this.isna(), other) + + new_index = self.index.union(other.index) + + this = self + # identify the index subset to keep for each series + keep_other = other.index.difference(this.index[notna(this)]) + keep_this = this.index.difference(keep_other) + + this = this.reindex(keep_this, copy=False) + other = other.reindex(keep_other, copy=False) + + if this.dtype.kind == "M" and other.dtype.kind != "M": + other = to_datetime(other) + combined = concat([this, other]) + combined = combined.reindex(new_index, copy=False) + return combined.__finalize__(self, method="combine_first") + + def update(self, other: Series | Sequence | Mapping) -> None: + """ + Modify Series in place using values from passed Series. + + Uses non-NA values from passed Series to make updates. Aligns + on index. + + Parameters + ---------- + other : Series, or object coercible into Series + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s.update(pd.Series([4, 5, 6])) + >>> s + 0 4 + 1 5 + 2 6 + dtype: int64 + + >>> s = pd.Series(['a', 'b', 'c']) + >>> s.update(pd.Series(['d', 'e'], index=[0, 2])) + >>> s + 0 d + 1 b + 2 e + dtype: object + + >>> s = pd.Series([1, 2, 3]) + >>> s.update(pd.Series([4, 5, 6, 7, 8])) + >>> s + 0 4 + 1 5 + 2 6 + dtype: int64 + + If ``other`` contains NaNs the corresponding values are not updated + in the original Series. + + >>> s = pd.Series([1, 2, 3]) + >>> s.update(pd.Series([4, np.nan, 6])) + >>> s + 0 4 + 1 2 + 2 6 + dtype: int64 + + ``other`` can also be a non-Series object type + that is coercible into a Series + + >>> s = pd.Series([1, 2, 3]) + >>> s.update([4, np.nan, 6]) + >>> s + 0 4 + 1 2 + 2 6 + dtype: int64 + + >>> s = pd.Series([1, 2, 3]) + >>> s.update({1: 9}) + >>> s + 0 1 + 1 9 + 2 3 + dtype: int64 + """ + if not PYPY and using_copy_on_write(): + if sys.getrefcount(self) <= REF_COUNT: + warnings.warn( + _chained_assignment_method_msg, + ChainedAssignmentError, + stacklevel=2, + ) + elif not PYPY and not using_copy_on_write() and self._is_view_after_cow_rules(): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if _check_cacher(self): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) + + if not isinstance(other, Series): + other = Series(other) + + other = other.reindex_like(self) + mask = notna(other) + + self._mgr = self._mgr.putmask(mask=mask, new=other) + self._maybe_update_cacher() + + # ---------------------------------------------------------------------- + # Reindexing, sorting + + @overload + def sort_values( + self, + *, + axis: Axis = ..., + ascending: bool | Sequence[bool] = ..., + inplace: Literal[False] = ..., + kind: SortKind = ..., + na_position: NaPosition = ..., + ignore_index: bool = ..., + key: ValueKeyFunc = ..., + ) -> Series: + ... + + @overload + def sort_values( + self, + *, + axis: Axis = ..., + ascending: bool | Sequence[bool] = ..., + inplace: Literal[True], + kind: SortKind = ..., + na_position: NaPosition = ..., + ignore_index: bool = ..., + key: ValueKeyFunc = ..., + ) -> None: + ... + + @overload + def sort_values( + self, + *, + axis: Axis = ..., + ascending: bool | Sequence[bool] = ..., + inplace: bool = ..., + kind: SortKind = ..., + na_position: NaPosition = ..., + ignore_index: bool = ..., + key: ValueKeyFunc = ..., + ) -> Series | None: + ... + + def sort_values( + self, + *, + axis: Axis = 0, + ascending: bool | Sequence[bool] = True, + inplace: bool = False, + kind: SortKind = "quicksort", + na_position: NaPosition = "last", + ignore_index: bool = False, + key: ValueKeyFunc | None = None, + ) -> Series | None: + """ + Sort by the values. + + Sort a Series in ascending or descending order by some + criterion. + + Parameters + ---------- + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + ascending : bool or list of bools, default True + If True, sort values in ascending order, otherwise descending. + inplace : bool, default False + If True, perform operation in-place. + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort' + Choice of sorting algorithm. See also :func:`numpy.sort` for more + information. 'mergesort' and 'stable' are the only stable algorithms. + na_position : {'first' or 'last'}, default 'last' + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + key : callable, optional + If not None, apply the key function to the series values + before sorting. This is similar to the `key` argument in the + builtin :meth:`sorted` function, with the notable difference that + this `key` function should be *vectorized*. It should expect a + ``Series`` and return an array-like. + + Returns + ------- + Series or None + Series ordered by values or None if ``inplace=True``. + + See Also + -------- + Series.sort_index : Sort by the Series indices. + DataFrame.sort_values : Sort DataFrame by the values along either axis. + DataFrame.sort_index : Sort DataFrame by indices. + + Examples + -------- + >>> s = pd.Series([np.nan, 1, 3, 10, 5]) + >>> s + 0 NaN + 1 1.0 + 2 3.0 + 3 10.0 + 4 5.0 + dtype: float64 + + Sort values ascending order (default behaviour) + + >>> s.sort_values(ascending=True) + 1 1.0 + 2 3.0 + 4 5.0 + 3 10.0 + 0 NaN + dtype: float64 + + Sort values descending order + + >>> s.sort_values(ascending=False) + 3 10.0 + 4 5.0 + 2 3.0 + 1 1.0 + 0 NaN + dtype: float64 + + Sort values putting NAs first + + >>> s.sort_values(na_position='first') + 0 NaN + 1 1.0 + 2 3.0 + 4 5.0 + 3 10.0 + dtype: float64 + + Sort a series of strings + + >>> s = pd.Series(['z', 'b', 'd', 'a', 'c']) + >>> s + 0 z + 1 b + 2 d + 3 a + 4 c + dtype: object + + >>> s.sort_values() + 3 a + 1 b + 4 c + 2 d + 0 z + dtype: object + + Sort using a key function. Your `key` function will be + given the ``Series`` of values and should return an array-like. + + >>> s = pd.Series(['a', 'B', 'c', 'D', 'e']) + >>> s.sort_values() + 1 B + 3 D + 0 a + 2 c + 4 e + dtype: object + >>> s.sort_values(key=lambda x: x.str.lower()) + 0 a + 1 B + 2 c + 3 D + 4 e + dtype: object + + NumPy ufuncs work well here. For example, we can + sort by the ``sin`` of the value + + >>> s = pd.Series([-4, -2, 0, 2, 4]) + >>> s.sort_values(key=np.sin) + 1 -2 + 4 4 + 2 0 + 0 -4 + 3 2 + dtype: int64 + + More complicated user-defined functions can be used, + as long as they expect a Series and return an array-like + + >>> s.sort_values(key=lambda x: (np.tan(x.cumsum()))) + 0 -4 + 3 2 + 4 4 + 1 -2 + 2 0 + dtype: int64 + """ + inplace = validate_bool_kwarg(inplace, "inplace") + # Validate the axis parameter + self._get_axis_number(axis) + + # GH 5856/5853 + if inplace and self._is_cached: + raise ValueError( + "This Series is a view of some other array, to " + "sort in-place you must create a copy" + ) + + if is_list_like(ascending): + ascending = cast(Sequence[bool], ascending) + if len(ascending) != 1: + raise ValueError( + f"Length of ascending ({len(ascending)}) must be 1 for Series" + ) + ascending = ascending[0] + + ascending = validate_ascending(ascending) + + if na_position not in ["first", "last"]: + raise ValueError(f"invalid na_position: {na_position}") + + # GH 35922. Make sorting stable by leveraging nargsort + if key: + values_to_sort = cast(Series, ensure_key_mapped(self, key))._values + else: + values_to_sort = self._values + sorted_index = nargsort(values_to_sort, kind, bool(ascending), na_position) + + if is_range_indexer(sorted_index, len(sorted_index)): + if inplace: + return self._update_inplace(self) + return self.copy(deep=None) + + result = self._constructor( + self._values[sorted_index], index=self.index[sorted_index], copy=False + ) + + if ignore_index: + result.index = default_index(len(sorted_index)) + + if not inplace: + return result.__finalize__(self, method="sort_values") + self._update_inplace(result) + return None + + @overload + def sort_index( + self, + *, + axis: Axis = ..., + level: IndexLabel = ..., + ascending: bool | Sequence[bool] = ..., + inplace: Literal[True], + kind: SortKind = ..., + na_position: NaPosition = ..., + sort_remaining: bool = ..., + ignore_index: bool = ..., + key: IndexKeyFunc = ..., + ) -> None: + ... + + @overload + def sort_index( + self, + *, + axis: Axis = ..., + level: IndexLabel = ..., + ascending: bool | Sequence[bool] = ..., + inplace: Literal[False] = ..., + kind: SortKind = ..., + na_position: NaPosition = ..., + sort_remaining: bool = ..., + ignore_index: bool = ..., + key: IndexKeyFunc = ..., + ) -> Series: + ... + + @overload + def sort_index( + self, + *, + axis: Axis = ..., + level: IndexLabel = ..., + ascending: bool | Sequence[bool] = ..., + inplace: bool = ..., + kind: SortKind = ..., + na_position: NaPosition = ..., + sort_remaining: bool = ..., + ignore_index: bool = ..., + key: IndexKeyFunc = ..., + ) -> Series | None: + ... + + def sort_index( + self, + *, + axis: Axis = 0, + level: IndexLabel | None = None, + ascending: bool | Sequence[bool] = True, + inplace: bool = False, + kind: SortKind = "quicksort", + na_position: NaPosition = "last", + sort_remaining: bool = True, + ignore_index: bool = False, + key: IndexKeyFunc | None = None, + ) -> Series | None: + """ + Sort Series by index labels. + + Returns a new Series sorted by label if `inplace` argument is + ``False``, otherwise updates the original series and returns None. + + Parameters + ---------- + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + level : int, optional + If not None, sort on values in specified index level(s). + ascending : bool or list-like of bools, default True + Sort ascending vs. descending. When the index is a MultiIndex the + sort direction can be controlled for each level individually. + inplace : bool, default False + If True, perform operation in-place. + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort' + Choice of sorting algorithm. See also :func:`numpy.sort` for more + information. 'mergesort' and 'stable' are the only stable algorithms. For + DataFrames, this option is only applied when sorting on a single + column or label. + na_position : {'first', 'last'}, default 'last' + If 'first' puts NaNs at the beginning, 'last' puts NaNs at the end. + Not implemented for MultiIndex. + sort_remaining : bool, default True + If True and sorting by level and index is multilevel, sort by other + levels too (in order) after sorting by specified level. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + key : callable, optional + If not None, apply the key function to the index values + before sorting. This is similar to the `key` argument in the + builtin :meth:`sorted` function, with the notable difference that + this `key` function should be *vectorized*. It should expect an + ``Index`` and return an ``Index`` of the same shape. + + Returns + ------- + Series or None + The original Series sorted by the labels or None if ``inplace=True``. + + See Also + -------- + DataFrame.sort_index: Sort DataFrame by the index. + DataFrame.sort_values: Sort DataFrame by the value. + Series.sort_values : Sort Series by the value. + + Examples + -------- + >>> s = pd.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, 4]) + >>> s.sort_index() + 1 c + 2 b + 3 a + 4 d + dtype: object + + Sort Descending + + >>> s.sort_index(ascending=False) + 4 d + 3 a + 2 b + 1 c + dtype: object + + By default NaNs are put at the end, but use `na_position` to place + them at the beginning + + >>> s = pd.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, np.nan]) + >>> s.sort_index(na_position='first') + NaN d + 1.0 c + 2.0 b + 3.0 a + dtype: object + + Specify index level to sort + + >>> arrays = [np.array(['qux', 'qux', 'foo', 'foo', + ... 'baz', 'baz', 'bar', 'bar']), + ... np.array(['two', 'one', 'two', 'one', + ... 'two', 'one', 'two', 'one'])] + >>> s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8], index=arrays) + >>> s.sort_index(level=1) + bar one 8 + baz one 6 + foo one 4 + qux one 2 + bar two 7 + baz two 5 + foo two 3 + qux two 1 + dtype: int64 + + Does not sort by remaining levels when sorting by levels + + >>> s.sort_index(level=1, sort_remaining=False) + qux one 2 + foo one 4 + baz one 6 + bar one 8 + qux two 1 + foo two 3 + baz two 5 + bar two 7 + dtype: int64 + + Apply a key function before sorting + + >>> s = pd.Series([1, 2, 3, 4], index=['A', 'b', 'C', 'd']) + >>> s.sort_index(key=lambda x : x.str.lower()) + A 1 + b 2 + C 3 + d 4 + dtype: int64 + """ + + return super().sort_index( + axis=axis, + level=level, + ascending=ascending, + inplace=inplace, + kind=kind, + na_position=na_position, + sort_remaining=sort_remaining, + ignore_index=ignore_index, + key=key, + ) + + def argsort( + self, + axis: Axis = 0, + kind: SortKind = "quicksort", + order: None = None, + stable: None = None, + ) -> Series: + """ + Return the integer indices that would sort the Series values. + + Override ndarray.argsort. Argsorts the value, omitting NA/null values, + and places the result in the same locations as the non-NA values. + + Parameters + ---------- + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + kind : {'mergesort', 'quicksort', 'heapsort', 'stable'}, default 'quicksort' + Choice of sorting algorithm. See :func:`numpy.sort` for more + information. 'mergesort' and 'stable' are the only stable algorithms. + order : None + Has no effect but is accepted for compatibility with numpy. + stable : None + Has no effect but is accepted for compatibility with numpy. + + Returns + ------- + Series[np.intp] + Positions of values within the sort order with -1 indicating + nan values. + + See Also + -------- + numpy.ndarray.argsort : Returns the indices that would sort this array. + + Examples + -------- + >>> s = pd.Series([3, 2, 1]) + >>> s.argsort() + 0 2 + 1 1 + 2 0 + dtype: int64 + """ + if axis != -1: + # GH#54257 We allow -1 here so that np.argsort(series) works + self._get_axis_number(axis) + + values = self._values + mask = isna(values) + + if mask.any(): + # TODO(3.0): once this deprecation is enforced we can call + # self.array.argsort directly, which will close GH#43840 and + # GH#12694 + warnings.warn( + "The behavior of Series.argsort in the presence of NA values is " + "deprecated. In a future version, NA values will be ordered " + "last instead of set to -1.", + FutureWarning, + stacklevel=find_stack_level(), + ) + result = np.full(len(self), -1, dtype=np.intp) + notmask = ~mask + result[notmask] = np.argsort(values[notmask], kind=kind) + else: + result = np.argsort(values, kind=kind) + + res = self._constructor( + result, index=self.index, name=self.name, dtype=np.intp, copy=False + ) + return res.__finalize__(self, method="argsort") + + def nlargest( + self, n: int = 5, keep: Literal["first", "last", "all"] = "first" + ) -> Series: + """ + Return the largest `n` elements. + + Parameters + ---------- + n : int, default 5 + Return this many descending sorted values. + keep : {'first', 'last', 'all'}, default 'first' + When there are duplicate values that cannot all fit in a + Series of `n` elements: + + - ``first`` : return the first `n` occurrences in order + of appearance. + - ``last`` : return the last `n` occurrences in reverse + order of appearance. + - ``all`` : keep all occurrences. This can result in a Series of + size larger than `n`. + + Returns + ------- + Series + The `n` largest values in the Series, sorted in decreasing order. + + See Also + -------- + Series.nsmallest: Get the `n` smallest elements. + Series.sort_values: Sort Series by values. + Series.head: Return the first `n` rows. + + Notes + ----- + Faster than ``.sort_values(ascending=False).head(n)`` for small `n` + relative to the size of the ``Series`` object. + + Examples + -------- + >>> countries_population = {"Italy": 59000000, "France": 65000000, + ... "Malta": 434000, "Maldives": 434000, + ... "Brunei": 434000, "Iceland": 337000, + ... "Nauru": 11300, "Tuvalu": 11300, + ... "Anguilla": 11300, "Montserrat": 5200} + >>> s = pd.Series(countries_population) + >>> s + Italy 59000000 + France 65000000 + Malta 434000 + Maldives 434000 + Brunei 434000 + Iceland 337000 + Nauru 11300 + Tuvalu 11300 + Anguilla 11300 + Montserrat 5200 + dtype: int64 + + The `n` largest elements where ``n=5`` by default. + + >>> s.nlargest() + France 65000000 + Italy 59000000 + Malta 434000 + Maldives 434000 + Brunei 434000 + dtype: int64 + + The `n` largest elements where ``n=3``. Default `keep` value is 'first' + so Malta will be kept. + + >>> s.nlargest(3) + France 65000000 + Italy 59000000 + Malta 434000 + dtype: int64 + + The `n` largest elements where ``n=3`` and keeping the last duplicates. + Brunei will be kept since it is the last with value 434000 based on + the index order. + + >>> s.nlargest(3, keep='last') + France 65000000 + Italy 59000000 + Brunei 434000 + dtype: int64 + + The `n` largest elements where ``n=3`` with all duplicates kept. Note + that the returned Series has five elements due to the three duplicates. + + >>> s.nlargest(3, keep='all') + France 65000000 + Italy 59000000 + Malta 434000 + Maldives 434000 + Brunei 434000 + dtype: int64 + """ + return selectn.SelectNSeries(self, n=n, keep=keep).nlargest() + + def nsmallest( + self, n: int = 5, keep: Literal["first", "last", "all"] = "first" + ) -> Series: + """ + Return the smallest `n` elements. + + Parameters + ---------- + n : int, default 5 + Return this many ascending sorted values. + keep : {'first', 'last', 'all'}, default 'first' + When there are duplicate values that cannot all fit in a + Series of `n` elements: + + - ``first`` : return the first `n` occurrences in order + of appearance. + - ``last`` : return the last `n` occurrences in reverse + order of appearance. + - ``all`` : keep all occurrences. This can result in a Series of + size larger than `n`. + + Returns + ------- + Series + The `n` smallest values in the Series, sorted in increasing order. + + See Also + -------- + Series.nlargest: Get the `n` largest elements. + Series.sort_values: Sort Series by values. + Series.head: Return the first `n` rows. + + Notes + ----- + Faster than ``.sort_values().head(n)`` for small `n` relative to + the size of the ``Series`` object. + + Examples + -------- + >>> countries_population = {"Italy": 59000000, "France": 65000000, + ... "Brunei": 434000, "Malta": 434000, + ... "Maldives": 434000, "Iceland": 337000, + ... "Nauru": 11300, "Tuvalu": 11300, + ... "Anguilla": 11300, "Montserrat": 5200} + >>> s = pd.Series(countries_population) + >>> s + Italy 59000000 + France 65000000 + Brunei 434000 + Malta 434000 + Maldives 434000 + Iceland 337000 + Nauru 11300 + Tuvalu 11300 + Anguilla 11300 + Montserrat 5200 + dtype: int64 + + The `n` smallest elements where ``n=5`` by default. + + >>> s.nsmallest() + Montserrat 5200 + Nauru 11300 + Tuvalu 11300 + Anguilla 11300 + Iceland 337000 + dtype: int64 + + The `n` smallest elements where ``n=3``. Default `keep` value is + 'first' so Nauru and Tuvalu will be kept. + + >>> s.nsmallest(3) + Montserrat 5200 + Nauru 11300 + Tuvalu 11300 + dtype: int64 + + The `n` smallest elements where ``n=3`` and keeping the last + duplicates. Anguilla and Tuvalu will be kept since they are the last + with value 11300 based on the index order. + + >>> s.nsmallest(3, keep='last') + Montserrat 5200 + Anguilla 11300 + Tuvalu 11300 + dtype: int64 + + The `n` smallest elements where ``n=3`` with all duplicates kept. Note + that the returned Series has four elements due to the three duplicates. + + >>> s.nsmallest(3, keep='all') + Montserrat 5200 + Nauru 11300 + Tuvalu 11300 + Anguilla 11300 + dtype: int64 + """ + return selectn.SelectNSeries(self, n=n, keep=keep).nsmallest() + + @doc( + klass=_shared_doc_kwargs["klass"], + extra_params=dedent( + """copy : bool, default True + Whether to copy underlying data. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True``""" + ), + examples=dedent( + """\ + Examples + -------- + >>> s = pd.Series( + ... ["A", "B", "A", "C"], + ... index=[ + ... ["Final exam", "Final exam", "Coursework", "Coursework"], + ... ["History", "Geography", "History", "Geography"], + ... ["January", "February", "March", "April"], + ... ], + ... ) + >>> s + Final exam History January A + Geography February B + Coursework History March A + Geography April C + dtype: object + + In the following example, we will swap the levels of the indices. + Here, we will swap the levels column-wise, but levels can be swapped row-wise + in a similar manner. Note that column-wise is the default behaviour. + By not supplying any arguments for i and j, we swap the last and second to + last indices. + + >>> s.swaplevel() + Final exam January History A + February Geography B + Coursework March History A + April Geography C + dtype: object + + By supplying one argument, we can choose which index to swap the last + index with. We can for example swap the first index with the last one as + follows. + + >>> s.swaplevel(0) + January History Final exam A + February Geography Final exam B + March History Coursework A + April Geography Coursework C + dtype: object + + We can also define explicitly which indices we want to swap by supplying values + for both i and j. Here, we for example swap the first and second indices. + + >>> s.swaplevel(0, 1) + History Final exam January A + Geography Final exam February B + History Coursework March A + Geography Coursework April C + dtype: object""" + ), + ) + def swaplevel( + self, i: Level = -2, j: Level = -1, copy: bool | None = None + ) -> Series: + """ + Swap levels i and j in a :class:`MultiIndex`. + + Default is to swap the two innermost levels of the index. + + Parameters + ---------- + i, j : int or str + Levels of the indices to be swapped. Can pass level name as string. + {extra_params} + + Returns + ------- + {klass} + {klass} with levels swapped in MultiIndex. + + {examples} + """ + assert isinstance(self.index, MultiIndex) + result = self.copy(deep=copy and not using_copy_on_write()) + result.index = self.index.swaplevel(i, j) + return result + + def reorder_levels(self, order: Sequence[Level]) -> Series: + """ + Rearrange index levels using input order. + + May not drop or duplicate levels. + + Parameters + ---------- + order : list of int representing new level order + Reference level by number or key. + + Returns + ------- + type of caller (new object) + + Examples + -------- + >>> arrays = [np.array(["dog", "dog", "cat", "cat", "bird", "bird"]), + ... np.array(["white", "black", "white", "black", "white", "black"])] + >>> s = pd.Series([1, 2, 3, 3, 5, 2], index=arrays) + >>> s + dog white 1 + black 2 + cat white 3 + black 3 + bird white 5 + black 2 + dtype: int64 + >>> s.reorder_levels([1, 0]) + white dog 1 + black dog 2 + white cat 3 + black cat 3 + white bird 5 + black bird 2 + dtype: int64 + """ + if not isinstance(self.index, MultiIndex): # pragma: no cover + raise Exception("Can only reorder levels on a hierarchical axis.") + + result = self.copy(deep=None) + assert isinstance(result.index, MultiIndex) + result.index = result.index.reorder_levels(order) + return result + + def explode(self, ignore_index: bool = False) -> Series: + """ + Transform each element of a list-like to a row. + + Parameters + ---------- + ignore_index : bool, default False + If True, the resulting index will be labeled 0, 1, …, n - 1. + + Returns + ------- + Series + Exploded lists to rows; index will be duplicated for these rows. + + See Also + -------- + Series.str.split : Split string values on specified separator. + Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex + to produce DataFrame. + DataFrame.melt : Unpivot a DataFrame from wide format to long format. + DataFrame.explode : Explode a DataFrame from list-like + columns to long format. + + Notes + ----- + This routine will explode list-likes including lists, tuples, sets, + Series, and np.ndarray. The result dtype of the subset rows will + be object. Scalars will be returned unchanged, and empty list-likes will + result in a np.nan for that row. In addition, the ordering of elements in + the output will be non-deterministic when exploding sets. + + Reference :ref:`the user guide ` for more examples. + + Examples + -------- + >>> s = pd.Series([[1, 2, 3], 'foo', [], [3, 4]]) + >>> s + 0 [1, 2, 3] + 1 foo + 2 [] + 3 [3, 4] + dtype: object + + >>> s.explode() + 0 1 + 0 2 + 0 3 + 1 foo + 2 NaN + 3 3 + 3 4 + dtype: object + """ + if isinstance(self.dtype, ExtensionDtype): + values, counts = self._values._explode() + elif len(self) and is_object_dtype(self.dtype): + values, counts = reshape.explode(np.asarray(self._values)) + else: + result = self.copy() + return result.reset_index(drop=True) if ignore_index else result + + if ignore_index: + index: Index = default_index(len(values)) + else: + index = self.index.repeat(counts) + + return self._constructor(values, index=index, name=self.name, copy=False) + + def unstack( + self, + level: IndexLabel = -1, + fill_value: Hashable | None = None, + sort: bool = True, + ) -> DataFrame: + """ + Unstack, also known as pivot, Series with MultiIndex to produce DataFrame. + + Parameters + ---------- + level : int, str, or list of these, default last level + Level(s) to unstack, can pass level name. + fill_value : scalar value, default None + Value to use when replacing NaN values. + sort : bool, default True + Sort the level(s) in the resulting MultiIndex columns. + + Returns + ------- + DataFrame + Unstacked Series. + + Notes + ----- + Reference :ref:`the user guide ` for more examples. + + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4], + ... index=pd.MultiIndex.from_product([['one', 'two'], + ... ['a', 'b']])) + >>> s + one a 1 + b 2 + two a 3 + b 4 + dtype: int64 + + >>> s.unstack(level=-1) + a b + one 1 2 + two 3 4 + + >>> s.unstack(level=0) + one two + a 1 3 + b 2 4 + """ + from pandas.core.reshape.reshape import unstack + + return unstack(self, level, fill_value, sort) + + # ---------------------------------------------------------------------- + # function application + + def map( + self, + arg: Callable | Mapping | Series, + na_action: Literal["ignore"] | None = None, + ) -> Series: + """ + Map values of Series according to an input mapping or function. + + Used for substituting each value in a Series with another value, + that may be derived from a function, a ``dict`` or + a :class:`Series`. + + Parameters + ---------- + arg : function, collections.abc.Mapping subclass or Series + Mapping correspondence. + na_action : {None, 'ignore'}, default None + If 'ignore', propagate NaN values, without passing them to the + mapping correspondence. + + Returns + ------- + Series + Same index as caller. + + See Also + -------- + Series.apply : For applying more complex functions on a Series. + Series.replace: Replace values given in `to_replace` with `value`. + DataFrame.apply : Apply a function row-/column-wise. + DataFrame.map : Apply a function elementwise on a whole DataFrame. + + Notes + ----- + When ``arg`` is a dictionary, values in Series that are not in the + dictionary (as keys) are converted to ``NaN``. However, if the + dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e. + provides a method for default values), then this default is used + rather than ``NaN``. + + Examples + -------- + >>> s = pd.Series(['cat', 'dog', np.nan, 'rabbit']) + >>> s + 0 cat + 1 dog + 2 NaN + 3 rabbit + dtype: object + + ``map`` accepts a ``dict`` or a ``Series``. Values that are not found + in the ``dict`` are converted to ``NaN``, unless the dict has a default + value (e.g. ``defaultdict``): + + >>> s.map({'cat': 'kitten', 'dog': 'puppy'}) + 0 kitten + 1 puppy + 2 NaN + 3 NaN + dtype: object + + It also accepts a function: + + >>> s.map('I am a {}'.format) + 0 I am a cat + 1 I am a dog + 2 I am a nan + 3 I am a rabbit + dtype: object + + To avoid applying the function to missing values (and keep them as + ``NaN``) ``na_action='ignore'`` can be used: + + >>> s.map('I am a {}'.format, na_action='ignore') + 0 I am a cat + 1 I am a dog + 2 NaN + 3 I am a rabbit + dtype: object + """ + new_values = self._map_values(arg, na_action=na_action) + return self._constructor(new_values, index=self.index, copy=False).__finalize__( + self, method="map" + ) + + def _gotitem(self, key, ndim, subset=None) -> Self: + """ + Sub-classes to define. Return a sliced object. + + Parameters + ---------- + key : string / list of selections + ndim : {1, 2} + Requested ndim of result. + subset : object, default None + Subset to act on. + """ + return self + + _agg_see_also_doc = dedent( + """ + See Also + -------- + Series.apply : Invoke function on a Series. + Series.transform : Transform function producing a Series with like indexes. + """ + ) + + _agg_examples_doc = dedent( + """ + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4]) + >>> s + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + + >>> s.agg('min') + 1 + + >>> s.agg(['min', 'max']) + min 1 + max 4 + dtype: int64 + """ + ) + + @doc( + _shared_docs["aggregate"], + klass=_shared_doc_kwargs["klass"], + axis=_shared_doc_kwargs["axis"], + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + ) + def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs): + # Validate the axis parameter + self._get_axis_number(axis) + + # if func is None, will switch to user-provided "named aggregation" kwargs + if func is None: + func = dict(kwargs.items()) + + op = SeriesApply(self, func, args=args, kwargs=kwargs) + result = op.agg() + return result + + agg = aggregate + + @doc( + _shared_docs["transform"], + klass=_shared_doc_kwargs["klass"], + axis=_shared_doc_kwargs["axis"], + ) + def transform( + self, func: AggFuncType, axis: Axis = 0, *args, **kwargs + ) -> DataFrame | Series: + # Validate axis argument + self._get_axis_number(axis) + ser = ( + self.copy(deep=False) + if using_copy_on_write() or warn_copy_on_write() + else self + ) + result = SeriesApply(ser, func=func, args=args, kwargs=kwargs).transform() + return result + + def apply( + self, + func: AggFuncType, + convert_dtype: bool | lib.NoDefault = lib.no_default, + args: tuple[Any, ...] = (), + *, + by_row: Literal[False, "compat"] = "compat", + **kwargs, + ) -> DataFrame | Series: + """ + Invoke function on values of Series. + + Can be ufunc (a NumPy function that applies to the entire Series) + or a Python function that only works on single values. + + Parameters + ---------- + func : function + Python function or NumPy ufunc to apply. + convert_dtype : bool, default True + Try to find better dtype for elementwise function results. If + False, leave as dtype=object. Note that the dtype is always + preserved for some extension array dtypes, such as Categorical. + + .. deprecated:: 2.1.0 + ``convert_dtype`` has been deprecated. Do ``ser.astype(object).apply()`` + instead if you want ``convert_dtype=False``. + args : tuple + Positional arguments passed to func after the series value. + by_row : False or "compat", default "compat" + If ``"compat"`` and func is a callable, func will be passed each element of + the Series, like ``Series.map``. If func is a list or dict of + callables, will first try to translate each func into pandas methods. If + that doesn't work, will try call to apply again with ``by_row="compat"`` + and if that fails, will call apply again with ``by_row=False`` + (backward compatible). + If False, the func will be passed the whole Series at once. + + ``by_row`` has no effect when ``func`` is a string. + + .. versionadded:: 2.1.0 + **kwargs + Additional keyword arguments passed to func. + + Returns + ------- + Series or DataFrame + If func returns a Series object the result will be a DataFrame. + + See Also + -------- + Series.map: For element-wise operations. + Series.agg: Only perform aggregating type operations. + Series.transform: Only perform transforming type operations. + + Notes + ----- + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` + for more details. + + Examples + -------- + Create a series with typical summer temperatures for each city. + + >>> s = pd.Series([20, 21, 12], + ... index=['London', 'New York', 'Helsinki']) + >>> s + London 20 + New York 21 + Helsinki 12 + dtype: int64 + + Square the values by defining a function and passing it as an + argument to ``apply()``. + + >>> def square(x): + ... return x ** 2 + >>> s.apply(square) + London 400 + New York 441 + Helsinki 144 + dtype: int64 + + Square the values by passing an anonymous function as an + argument to ``apply()``. + + >>> s.apply(lambda x: x ** 2) + London 400 + New York 441 + Helsinki 144 + dtype: int64 + + Define a custom function that needs additional positional + arguments and pass these additional arguments using the + ``args`` keyword. + + >>> def subtract_custom_value(x, custom_value): + ... return x - custom_value + + >>> s.apply(subtract_custom_value, args=(5,)) + London 15 + New York 16 + Helsinki 7 + dtype: int64 + + Define a custom function that takes keyword arguments + and pass these arguments to ``apply``. + + >>> def add_custom_values(x, **kwargs): + ... for month in kwargs: + ... x += kwargs[month] + ... return x + + >>> s.apply(add_custom_values, june=30, july=20, august=25) + London 95 + New York 96 + Helsinki 87 + dtype: int64 + + Use a function from the Numpy library. + + >>> s.apply(np.log) + London 2.995732 + New York 3.044522 + Helsinki 2.484907 + dtype: float64 + """ + return SeriesApply( + self, + func, + convert_dtype=convert_dtype, + by_row=by_row, + args=args, + kwargs=kwargs, + ).apply() + + def _reindex_indexer( + self, + new_index: Index | None, + indexer: npt.NDArray[np.intp] | None, + copy: bool | None, + ) -> Series: + # Note: new_index is None iff indexer is None + # if not None, indexer is np.intp + if indexer is None and ( + new_index is None or new_index.names == self.index.names + ): + if using_copy_on_write(): + return self.copy(deep=copy) + if copy or copy is None: + return self.copy(deep=copy) + return self + + new_values = algorithms.take_nd( + self._values, indexer, allow_fill=True, fill_value=None + ) + return self._constructor(new_values, index=new_index, copy=False) + + def _needs_reindex_multi(self, axes, method, level) -> bool: + """ + Check if we do need a multi reindex; this is for compat with + higher dims. + """ + return False + + @overload + def rename( + self, + index: Renamer | Hashable | None = ..., + *, + axis: Axis | None = ..., + copy: bool = ..., + inplace: Literal[True], + level: Level | None = ..., + errors: IgnoreRaise = ..., + ) -> None: + ... + + @overload + def rename( + self, + index: Renamer | Hashable | None = ..., + *, + axis: Axis | None = ..., + copy: bool = ..., + inplace: Literal[False] = ..., + level: Level | None = ..., + errors: IgnoreRaise = ..., + ) -> Series: + ... + + @overload + def rename( + self, + index: Renamer | Hashable | None = ..., + *, + axis: Axis | None = ..., + copy: bool = ..., + inplace: bool = ..., + level: Level | None = ..., + errors: IgnoreRaise = ..., + ) -> Series | None: + ... + + def rename( + self, + index: Renamer | Hashable | None = None, + *, + axis: Axis | None = None, + copy: bool | None = None, + inplace: bool = False, + level: Level | None = None, + errors: IgnoreRaise = "ignore", + ) -> Series | None: + """ + Alter Series index labels or name. + + Function / dict values must be unique (1-to-1). Labels not contained in + a dict / Series will be left as-is. Extra labels listed don't throw an + error. + + Alternatively, change ``Series.name`` with a scalar value. + + See the :ref:`user guide ` for more. + + Parameters + ---------- + index : scalar, hashable sequence, dict-like or function optional + Functions or dict-like are transformations to apply to + the index. + Scalar or hashable sequence-like will alter the ``Series.name`` + attribute. + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + copy : bool, default True + Also copy underlying data. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + inplace : bool, default False + Whether to return a new Series. If True the value of copy is ignored. + level : int or level name, default None + In case of MultiIndex, only rename labels in the specified level. + errors : {'ignore', 'raise'}, default 'ignore' + If 'raise', raise `KeyError` when a `dict-like mapper` or + `index` contains labels that are not present in the index being transformed. + If 'ignore', existing keys will be renamed and extra keys will be ignored. + + Returns + ------- + Series or None + Series with index labels or name altered or None if ``inplace=True``. + + See Also + -------- + DataFrame.rename : Corresponding DataFrame method. + Series.rename_axis : Set the name of the axis. + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> s.rename("my_name") # scalar, changes Series.name + 0 1 + 1 2 + 2 3 + Name: my_name, dtype: int64 + >>> s.rename(lambda x: x ** 2) # function, changes labels + 0 1 + 1 2 + 4 3 + dtype: int64 + >>> s.rename({1: 3, 2: 5}) # mapping, changes labels + 0 1 + 3 2 + 5 3 + dtype: int64 + """ + if axis is not None: + # Make sure we raise if an invalid 'axis' is passed. + axis = self._get_axis_number(axis) + + if callable(index) or is_dict_like(index): + # error: Argument 1 to "_rename" of "NDFrame" has incompatible + # type "Union[Union[Mapping[Any, Hashable], Callable[[Any], + # Hashable]], Hashable, None]"; expected "Union[Mapping[Any, + # Hashable], Callable[[Any], Hashable], None]" + return super()._rename( + index, # type: ignore[arg-type] + copy=copy, + inplace=inplace, + level=level, + errors=errors, + ) + else: + return self._set_name(index, inplace=inplace, deep=copy) + + @Appender( + """ + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + + >>> s.set_axis(['a', 'b', 'c'], axis=0) + a 1 + b 2 + c 3 + dtype: int64 + """ + ) + @Substitution( + klass=_shared_doc_kwargs["klass"], + axes_single_arg=_shared_doc_kwargs["axes_single_arg"], + extended_summary_sub="", + axis_description_sub="", + see_also_sub="", + ) + @Appender(NDFrame.set_axis.__doc__) + def set_axis( + self, + labels, + *, + axis: Axis = 0, + copy: bool | None = None, + ) -> Series: + return super().set_axis(labels, axis=axis, copy=copy) + + # error: Cannot determine type of 'reindex' + @doc( + NDFrame.reindex, # type: ignore[has-type] + klass=_shared_doc_kwargs["klass"], + optional_reindex=_shared_doc_kwargs["optional_reindex"], + ) + def reindex( # type: ignore[override] + self, + index=None, + *, + axis: Axis | None = None, + method: ReindexMethod | None = None, + copy: bool | None = None, + level: Level | None = None, + fill_value: Scalar | None = None, + limit: int | None = None, + tolerance=None, + ) -> Series: + return super().reindex( + index=index, + method=method, + copy=copy, + level=level, + fill_value=fill_value, + limit=limit, + tolerance=tolerance, + ) + + @overload # type: ignore[override] + def rename_axis( + self, + mapper: IndexLabel | lib.NoDefault = ..., + *, + index=..., + axis: Axis = ..., + copy: bool = ..., + inplace: Literal[True], + ) -> None: + ... + + @overload + def rename_axis( + self, + mapper: IndexLabel | lib.NoDefault = ..., + *, + index=..., + axis: Axis = ..., + copy: bool = ..., + inplace: Literal[False] = ..., + ) -> Self: + ... + + @overload + def rename_axis( + self, + mapper: IndexLabel | lib.NoDefault = ..., + *, + index=..., + axis: Axis = ..., + copy: bool = ..., + inplace: bool = ..., + ) -> Self | None: + ... + + @doc(NDFrame.rename_axis) + def rename_axis( + self, + mapper: IndexLabel | lib.NoDefault = lib.no_default, + *, + index=lib.no_default, + axis: Axis = 0, + copy: bool = True, + inplace: bool = False, + ) -> Self | None: + return super().rename_axis( + mapper=mapper, + index=index, + axis=axis, + copy=copy, + inplace=inplace, + ) + + @overload + def drop( + self, + labels: IndexLabel = ..., + *, + axis: Axis = ..., + index: IndexLabel = ..., + columns: IndexLabel = ..., + level: Level | None = ..., + inplace: Literal[True], + errors: IgnoreRaise = ..., + ) -> None: + ... + + @overload + def drop( + self, + labels: IndexLabel = ..., + *, + axis: Axis = ..., + index: IndexLabel = ..., + columns: IndexLabel = ..., + level: Level | None = ..., + inplace: Literal[False] = ..., + errors: IgnoreRaise = ..., + ) -> Series: + ... + + @overload + def drop( + self, + labels: IndexLabel = ..., + *, + axis: Axis = ..., + index: IndexLabel = ..., + columns: IndexLabel = ..., + level: Level | None = ..., + inplace: bool = ..., + errors: IgnoreRaise = ..., + ) -> Series | None: + ... + + def drop( + self, + labels: IndexLabel | None = None, + *, + axis: Axis = 0, + index: IndexLabel | None = None, + columns: IndexLabel | None = None, + level: Level | None = None, + inplace: bool = False, + errors: IgnoreRaise = "raise", + ) -> Series | None: + """ + Return Series with specified index labels removed. + + Remove elements of a Series based on specifying the index labels. + When using a multi-index, labels on different levels can be removed + by specifying the level. + + Parameters + ---------- + labels : single label or list-like + Index labels to drop. + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + index : single label or list-like + Redundant for application on Series, but 'index' can be used instead + of 'labels'. + columns : single label or list-like + No change is made to the Series; use 'index' or 'labels' instead. + level : int or level name, optional + For MultiIndex, level for which the labels will be removed. + inplace : bool, default False + If True, do operation inplace and return None. + errors : {'ignore', 'raise'}, default 'raise' + If 'ignore', suppress error and only existing labels are dropped. + + Returns + ------- + Series or None + Series with specified index labels removed or None if ``inplace=True``. + + Raises + ------ + KeyError + If none of the labels are found in the index. + + See Also + -------- + Series.reindex : Return only specified index labels of Series. + Series.dropna : Return series without null values. + Series.drop_duplicates : Return Series with duplicate values removed. + DataFrame.drop : Drop specified labels from rows or columns. + + Examples + -------- + >>> s = pd.Series(data=np.arange(3), index=['A', 'B', 'C']) + >>> s + A 0 + B 1 + C 2 + dtype: int64 + + Drop labels B en C + + >>> s.drop(labels=['B', 'C']) + A 0 + dtype: int64 + + Drop 2nd level label in MultiIndex Series + + >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'], + ... ['speed', 'weight', 'length']], + ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], + ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + >>> s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], + ... index=midx) + >>> s + llama speed 45.0 + weight 200.0 + length 1.2 + cow speed 30.0 + weight 250.0 + length 1.5 + falcon speed 320.0 + weight 1.0 + length 0.3 + dtype: float64 + + >>> s.drop(labels='weight', level=1) + llama speed 45.0 + length 1.2 + cow speed 30.0 + length 1.5 + falcon speed 320.0 + length 0.3 + dtype: float64 + """ + return super().drop( + labels=labels, + axis=axis, + index=index, + columns=columns, + level=level, + inplace=inplace, + errors=errors, + ) + + def pop(self, item: Hashable) -> Any: + """ + Return item and drops from series. Raise KeyError if not found. + + Parameters + ---------- + item : label + Index of the element that needs to be removed. + + Returns + ------- + Value that is popped from series. + + Examples + -------- + >>> ser = pd.Series([1, 2, 3]) + + >>> ser.pop(0) + 1 + + >>> ser + 1 2 + 2 3 + dtype: int64 + """ + return super().pop(item=item) + + @doc(INFO_DOCSTRING, **series_sub_kwargs) + def info( + self, + verbose: bool | None = None, + buf: IO[str] | None = None, + max_cols: int | None = None, + memory_usage: bool | str | None = None, + show_counts: bool = True, + ) -> None: + return SeriesInfo(self, memory_usage).render( + buf=buf, + max_cols=max_cols, + verbose=verbose, + show_counts=show_counts, + ) + + # TODO(3.0): this can be removed once GH#33302 deprecation is enforced + def _replace_single(self, to_replace, method: str, inplace: bool, limit): + """ + Replaces values in a Series using the fill method specified when no + replacement value is given in the replace method + """ + + result = self if inplace else self.copy() + + values = result._values + mask = missing.mask_missing(values, to_replace) + + if isinstance(values, ExtensionArray): + # dispatch to the EA's _pad_mask_inplace method + values._fill_mask_inplace(method, limit, mask) + else: + fill_f = missing.get_fill_func(method) + fill_f(values, limit=limit, mask=mask) + + if inplace: + return + return result + + def memory_usage(self, index: bool = True, deep: bool = False) -> int: + """ + Return the memory usage of the Series. + + The memory usage can optionally include the contribution of + the index and of elements of `object` dtype. + + Parameters + ---------- + index : bool, default True + Specifies whether to include the memory usage of the Series index. + deep : bool, default False + If True, introspect the data deeply by interrogating + `object` dtypes for system-level memory consumption, and include + it in the returned value. + + Returns + ------- + int + Bytes of memory consumed. + + See Also + -------- + numpy.ndarray.nbytes : Total bytes consumed by the elements of the + array. + DataFrame.memory_usage : Bytes consumed by a DataFrame. + + Examples + -------- + >>> s = pd.Series(range(3)) + >>> s.memory_usage() + 152 + + Not including the index gives the size of the rest of the data, which + is necessarily smaller: + + >>> s.memory_usage(index=False) + 24 + + The memory footprint of `object` values is ignored by default: + + >>> s = pd.Series(["a", "b"]) + >>> s.values + array(['a', 'b'], dtype=object) + >>> s.memory_usage() + 144 + >>> s.memory_usage(deep=True) + 244 + """ + v = self._memory_usage(deep=deep) + if index: + v += self.index.memory_usage(deep=deep) + return v + + def isin(self, values) -> Series: + """ + Whether elements in Series are contained in `values`. + + Return a boolean Series showing whether each element in the Series + matches an element in the passed sequence of `values` exactly. + + Parameters + ---------- + values : set or list-like + The sequence of values to test. Passing in a single string will + raise a ``TypeError``. Instead, turn a single string into a + list of one element. + + Returns + ------- + Series + Series of booleans indicating if each element is in values. + + Raises + ------ + TypeError + * If `values` is a string + + See Also + -------- + DataFrame.isin : Equivalent method on DataFrame. + + Examples + -------- + >>> s = pd.Series(['llama', 'cow', 'llama', 'beetle', 'llama', + ... 'hippo'], name='animal') + >>> s.isin(['cow', 'llama']) + 0 True + 1 True + 2 True + 3 False + 4 True + 5 False + Name: animal, dtype: bool + + To invert the boolean values, use the ``~`` operator: + + >>> ~s.isin(['cow', 'llama']) + 0 False + 1 False + 2 False + 3 True + 4 False + 5 True + Name: animal, dtype: bool + + Passing a single string as ``s.isin('llama')`` will raise an error. Use + a list of one element instead: + + >>> s.isin(['llama']) + 0 True + 1 False + 2 True + 3 False + 4 True + 5 False + Name: animal, dtype: bool + + Strings and integers are distinct and are therefore not comparable: + + >>> pd.Series([1]).isin(['1']) + 0 False + dtype: bool + >>> pd.Series([1.1]).isin(['1.1']) + 0 False + dtype: bool + """ + result = algorithms.isin(self._values, values) + return self._constructor(result, index=self.index, copy=False).__finalize__( + self, method="isin" + ) + + def between( + self, + left, + right, + inclusive: Literal["both", "neither", "left", "right"] = "both", + ) -> Series: + """ + Return boolean Series equivalent to left <= series <= right. + + This function returns a boolean vector containing `True` wherever the + corresponding Series element is between the boundary values `left` and + `right`. NA values are treated as `False`. + + Parameters + ---------- + left : scalar or list-like + Left boundary. + right : scalar or list-like + Right boundary. + inclusive : {"both", "neither", "left", "right"} + Include boundaries. Whether to set each bound as closed or open. + + .. versionchanged:: 1.3.0 + + Returns + ------- + Series + Series representing whether each element is between left and + right (inclusive). + + See Also + -------- + Series.gt : Greater than of series and other. + Series.lt : Less than of series and other. + + Notes + ----- + This function is equivalent to ``(left <= ser) & (ser <= right)`` + + Examples + -------- + >>> s = pd.Series([2, 0, 4, 8, np.nan]) + + Boundary values are included by default: + + >>> s.between(1, 4) + 0 True + 1 False + 2 True + 3 False + 4 False + dtype: bool + + With `inclusive` set to ``"neither"`` boundary values are excluded: + + >>> s.between(1, 4, inclusive="neither") + 0 True + 1 False + 2 False + 3 False + 4 False + dtype: bool + + `left` and `right` can be any scalar value: + + >>> s = pd.Series(['Alice', 'Bob', 'Carol', 'Eve']) + >>> s.between('Anna', 'Daniel') + 0 False + 1 True + 2 True + 3 False + dtype: bool + """ + if inclusive == "both": + lmask = self >= left + rmask = self <= right + elif inclusive == "left": + lmask = self >= left + rmask = self < right + elif inclusive == "right": + lmask = self > left + rmask = self <= right + elif inclusive == "neither": + lmask = self > left + rmask = self < right + else: + raise ValueError( + "Inclusive has to be either string of 'both'," + "'left', 'right', or 'neither'." + ) + + return lmask & rmask + + def case_when( + self, + caselist: list[ + tuple[ + ArrayLike | Callable[[Series], Series | np.ndarray | Sequence[bool]], + ArrayLike | Scalar | Callable[[Series], Series | np.ndarray], + ], + ], + ) -> Series: + """ + Replace values where the conditions are True. + + Parameters + ---------- + caselist : A list of tuples of conditions and expected replacements + Takes the form: ``(condition0, replacement0)``, + ``(condition1, replacement1)``, ... . + ``condition`` should be a 1-D boolean array-like object + or a callable. If ``condition`` is a callable, + it is computed on the Series + and should return a boolean Series or array. + The callable must not change the input Series + (though pandas doesn`t check it). ``replacement`` should be a + 1-D array-like object, a scalar or a callable. + If ``replacement`` is a callable, it is computed on the Series + and should return a scalar or Series. The callable + must not change the input Series + (though pandas doesn`t check it). + + .. versionadded:: 2.2.0 + + Returns + ------- + Series + + See Also + -------- + Series.mask : Replace values where the condition is True. + + Examples + -------- + >>> c = pd.Series([6, 7, 8, 9], name='c') + >>> a = pd.Series([0, 0, 1, 2]) + >>> b = pd.Series([0, 3, 4, 5]) + + >>> c.case_when(caselist=[(a.gt(0), a), # condition, replacement + ... (b.gt(0), b)]) + 0 6 + 1 3 + 2 1 + 3 2 + Name: c, dtype: int64 + """ + if not isinstance(caselist, list): + raise TypeError( + f"The caselist argument should be a list; instead got {type(caselist)}" + ) + + if not caselist: + raise ValueError( + "provide at least one boolean condition, " + "with a corresponding replacement." + ) + + for num, entry in enumerate(caselist): + if not isinstance(entry, tuple): + raise TypeError( + f"Argument {num} must be a tuple; instead got {type(entry)}." + ) + if len(entry) != 2: + raise ValueError( + f"Argument {num} must have length 2; " + "a condition and replacement; " + f"instead got length {len(entry)}." + ) + caselist = [ + ( + com.apply_if_callable(condition, self), + com.apply_if_callable(replacement, self), + ) + for condition, replacement in caselist + ] + default = self.copy() + conditions, replacements = zip(*caselist) + common_dtypes = [infer_dtype_from(arg)[0] for arg in [*replacements, default]] + if len(set(common_dtypes)) > 1: + common_dtype = find_common_type(common_dtypes) + updated_replacements = [] + for condition, replacement in zip(conditions, replacements): + if is_scalar(replacement): + replacement = construct_1d_arraylike_from_scalar( + value=replacement, length=len(condition), dtype=common_dtype + ) + elif isinstance(replacement, ABCSeries): + replacement = replacement.astype(common_dtype) + else: + replacement = pd_array(replacement, dtype=common_dtype) + updated_replacements.append(replacement) + replacements = updated_replacements + default = default.astype(common_dtype) + + counter = reversed(range(len(conditions))) + for position, condition, replacement in zip( + counter, conditions[::-1], replacements[::-1] + ): + try: + default = default.mask( + condition, other=replacement, axis=0, inplace=False, level=None + ) + except Exception as error: + raise ValueError( + f"Failed to apply condition{position} and replacement{position}." + ) from error + return default + + # error: Cannot determine type of 'isna' + @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) # type: ignore[has-type] + def isna(self) -> Series: + return NDFrame.isna(self) + + # error: Cannot determine type of 'isna' + @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) # type: ignore[has-type] + def isnull(self) -> Series: + """ + Series.isnull is an alias for Series.isna. + """ + return super().isnull() + + # error: Cannot determine type of 'notna' + @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) # type: ignore[has-type] + def notna(self) -> Series: + return super().notna() + + # error: Cannot determine type of 'notna' + @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) # type: ignore[has-type] + def notnull(self) -> Series: + """ + Series.notnull is an alias for Series.notna. + """ + return super().notnull() + + @overload + def dropna( + self, + *, + axis: Axis = ..., + inplace: Literal[False] = ..., + how: AnyAll | None = ..., + ignore_index: bool = ..., + ) -> Series: + ... + + @overload + def dropna( + self, + *, + axis: Axis = ..., + inplace: Literal[True], + how: AnyAll | None = ..., + ignore_index: bool = ..., + ) -> None: + ... + + def dropna( + self, + *, + axis: Axis = 0, + inplace: bool = False, + how: AnyAll | None = None, + ignore_index: bool = False, + ) -> Series | None: + """ + Return a new Series with missing values removed. + + See the :ref:`User Guide ` for more on which values are + considered missing, and how to work with missing data. + + Parameters + ---------- + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + inplace : bool, default False + If True, do operation inplace and return None. + how : str, optional + Not in use. Kept for compatibility. + ignore_index : bool, default ``False`` + If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 2.0.0 + + Returns + ------- + Series or None + Series with NA entries dropped from it or None if ``inplace=True``. + + See Also + -------- + Series.isna: Indicate missing values. + Series.notna : Indicate existing (non-missing) values. + Series.fillna : Replace missing values. + DataFrame.dropna : Drop rows or columns which contain NA values. + Index.dropna : Drop missing indices. + + Examples + -------- + >>> ser = pd.Series([1., 2., np.nan]) + >>> ser + 0 1.0 + 1 2.0 + 2 NaN + dtype: float64 + + Drop NA values from a Series. + + >>> ser.dropna() + 0 1.0 + 1 2.0 + dtype: float64 + + Empty strings are not considered NA values. ``None`` is considered an + NA value. + + >>> ser = pd.Series([np.nan, 2, pd.NaT, '', None, 'I stay']) + >>> ser + 0 NaN + 1 2 + 2 NaT + 3 + 4 None + 5 I stay + dtype: object + >>> ser.dropna() + 1 2 + 3 + 5 I stay + dtype: object + """ + inplace = validate_bool_kwarg(inplace, "inplace") + ignore_index = validate_bool_kwarg(ignore_index, "ignore_index") + # Validate the axis parameter + self._get_axis_number(axis or 0) + + if self._can_hold_na: + result = remove_na_arraylike(self) + else: + if not inplace: + result = self.copy(deep=None) + else: + result = self + + if ignore_index: + result.index = default_index(len(result)) + + if inplace: + return self._update_inplace(result) + else: + return result + + # ---------------------------------------------------------------------- + # Time series-oriented methods + + def to_timestamp( + self, + freq: Frequency | None = None, + how: Literal["s", "e", "start", "end"] = "start", + copy: bool | None = None, + ) -> Series: + """ + Cast to DatetimeIndex of Timestamps, at *beginning* of period. + + Parameters + ---------- + freq : str, default frequency of PeriodIndex + Desired frequency. + how : {'s', 'e', 'start', 'end'} + Convention for converting period to timestamp; start of period + vs. end. + copy : bool, default True + Whether or not to return a copy. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + + Returns + ------- + Series with DatetimeIndex + + Examples + -------- + >>> idx = pd.PeriodIndex(['2023', '2024', '2025'], freq='Y') + >>> s1 = pd.Series([1, 2, 3], index=idx) + >>> s1 + 2023 1 + 2024 2 + 2025 3 + Freq: Y-DEC, dtype: int64 + + The resulting frequency of the Timestamps is `YearBegin` + + >>> s1 = s1.to_timestamp() + >>> s1 + 2023-01-01 1 + 2024-01-01 2 + 2025-01-01 3 + Freq: YS-JAN, dtype: int64 + + Using `freq` which is the offset that the Timestamps will have + + >>> s2 = pd.Series([1, 2, 3], index=idx) + >>> s2 = s2.to_timestamp(freq='M') + >>> s2 + 2023-01-31 1 + 2024-01-31 2 + 2025-01-31 3 + Freq: YE-JAN, dtype: int64 + """ + if not isinstance(self.index, PeriodIndex): + raise TypeError(f"unsupported Type {type(self.index).__name__}") + + new_obj = self.copy(deep=copy and not using_copy_on_write()) + new_index = self.index.to_timestamp(freq=freq, how=how) + setattr(new_obj, "index", new_index) + return new_obj + + def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series: + """ + Convert Series from DatetimeIndex to PeriodIndex. + + Parameters + ---------- + freq : str, default None + Frequency associated with the PeriodIndex. + copy : bool, default True + Whether or not to return a copy. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + + Returns + ------- + Series + Series with index converted to PeriodIndex. + + Examples + -------- + >>> idx = pd.DatetimeIndex(['2023', '2024', '2025']) + >>> s = pd.Series([1, 2, 3], index=idx) + >>> s = s.to_period() + >>> s + 2023 1 + 2024 2 + 2025 3 + Freq: Y-DEC, dtype: int64 + + Viewing the index + + >>> s.index + PeriodIndex(['2023', '2024', '2025'], dtype='period[Y-DEC]') + """ + if not isinstance(self.index, DatetimeIndex): + raise TypeError(f"unsupported Type {type(self.index).__name__}") + + new_obj = self.copy(deep=copy and not using_copy_on_write()) + new_index = self.index.to_period(freq=freq) + setattr(new_obj, "index", new_index) + return new_obj + + # ---------------------------------------------------------------------- + # Add index + _AXIS_ORDERS: list[Literal["index", "columns"]] = ["index"] + _AXIS_LEN = len(_AXIS_ORDERS) + _info_axis_number: Literal[0] = 0 + _info_axis_name: Literal["index"] = "index" + + index = properties.AxisProperty( + axis=0, + doc=""" + The index (axis labels) of the Series. + + The index of a Series is used to label and identify each element of the + underlying data. The index can be thought of as an immutable ordered set + (technically a multi-set, as it may contain duplicate labels), and is + used to index and align data in pandas. + + Returns + ------- + Index + The index labels of the Series. + + See Also + -------- + Series.reindex : Conform Series to new index. + Index : The base pandas index type. + + Notes + ----- + For more information on pandas indexing, see the `indexing user guide + `__. + + Examples + -------- + To create a Series with a custom index and view the index labels: + + >>> cities = ['Kolkata', 'Chicago', 'Toronto', 'Lisbon'] + >>> populations = [14.85, 2.71, 2.93, 0.51] + >>> city_series = pd.Series(populations, index=cities) + >>> city_series.index + Index(['Kolkata', 'Chicago', 'Toronto', 'Lisbon'], dtype='object') + + To change the index labels of an existing Series: + + >>> city_series.index = ['KOL', 'CHI', 'TOR', 'LIS'] + >>> city_series.index + Index(['KOL', 'CHI', 'TOR', 'LIS'], dtype='object') + """, + ) + + # ---------------------------------------------------------------------- + # Accessor Methods + # ---------------------------------------------------------------------- + str = CachedAccessor("str", StringMethods) + dt = CachedAccessor("dt", CombinedDatetimelikeProperties) + cat = CachedAccessor("cat", CategoricalAccessor) + plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) + sparse = CachedAccessor("sparse", SparseAccessor) + struct = CachedAccessor("struct", StructAccessor) + list = CachedAccessor("list", ListAccessor) + + # ---------------------------------------------------------------------- + # Add plotting methods to Series + hist = pandas.plotting.hist_series + + # ---------------------------------------------------------------------- + # Template-Based Arithmetic/Comparison Methods + + def _cmp_method(self, other, op): + res_name = ops.get_op_result_name(self, other) + + if isinstance(other, Series) and not self._indexed_same(other): + raise ValueError("Can only compare identically-labeled Series objects") + + lvalues = self._values + rvalues = extract_array(other, extract_numpy=True, extract_range=True) + + res_values = ops.comparison_op(lvalues, rvalues, op) + + return self._construct_result(res_values, name=res_name) + + def _logical_method(self, other, op): + res_name = ops.get_op_result_name(self, other) + self, other = self._align_for_op(other, align_asobject=True) + + lvalues = self._values + rvalues = extract_array(other, extract_numpy=True, extract_range=True) + + res_values = ops.logical_op(lvalues, rvalues, op) + return self._construct_result(res_values, name=res_name) + + def _arith_method(self, other, op): + self, other = self._align_for_op(other) + return base.IndexOpsMixin._arith_method(self, other, op) + + def _align_for_op(self, right, align_asobject: bool = False): + """align lhs and rhs Series""" + # TODO: Different from DataFrame._align_for_op, list, tuple and ndarray + # are not coerced here + # because Series has inconsistencies described in GH#13637 + left = self + + if isinstance(right, Series): + # avoid repeated alignment + if not left.index.equals(right.index): + if align_asobject: + if left.dtype not in (object, np.bool_) or right.dtype not in ( + object, + np.bool_, + ): + warnings.warn( + "Operation between non boolean Series with different " + "indexes will no longer return a boolean result in " + "a future version. Cast both Series to object type " + "to maintain the prior behavior.", + FutureWarning, + stacklevel=find_stack_level(), + ) + # to keep original value's dtype for bool ops + left = left.astype(object) + right = right.astype(object) + + left, right = left.align(right, copy=False) + + return left, right + + def _binop(self, other: Series, func, level=None, fill_value=None) -> Series: + """ + Perform generic binary operation with optional fill value. + + Parameters + ---------- + other : Series + func : binary operator + fill_value : float or object + Value to substitute for NA/null values. If both Series are NA in a + location, the result will be NA regardless of the passed fill value. + level : int or level name, default None + Broadcast across a level, matching Index values on the + passed MultiIndex level. + + Returns + ------- + Series + """ + this = self + + if not self.index.equals(other.index): + this, other = self.align(other, level=level, join="outer", copy=False) + + this_vals, other_vals = ops.fill_binop(this._values, other._values, fill_value) + + with np.errstate(all="ignore"): + result = func(this_vals, other_vals) + + name = ops.get_op_result_name(self, other) + out = this._construct_result(result, name) + return cast(Series, out) + + def _construct_result( + self, result: ArrayLike | tuple[ArrayLike, ArrayLike], name: Hashable + ) -> Series | tuple[Series, Series]: + """ + Construct an appropriately-labelled Series from the result of an op. + + Parameters + ---------- + result : ndarray or ExtensionArray + name : Label + + Returns + ------- + Series + In the case of __divmod__ or __rdivmod__, a 2-tuple of Series. + """ + if isinstance(result, tuple): + # produced by divmod or rdivmod + + res1 = self._construct_result(result[0], name=name) + res2 = self._construct_result(result[1], name=name) + + # GH#33427 assertions to keep mypy happy + assert isinstance(res1, Series) + assert isinstance(res2, Series) + return (res1, res2) + + # TODO: result should always be ArrayLike, but this fails for some + # JSONArray tests + dtype = getattr(result, "dtype", None) + out = self._constructor(result, index=self.index, dtype=dtype, copy=False) + out = out.__finalize__(self) + + # Set the result's name after __finalize__ is called because __finalize__ + # would set it back to self.name + out.name = name + return out + + def _flex_method(self, other, op, *, level=None, fill_value=None, axis: Axis = 0): + if axis is not None: + self._get_axis_number(axis) + + res_name = ops.get_op_result_name(self, other) + + if isinstance(other, Series): + return self._binop(other, op, level=level, fill_value=fill_value) + elif isinstance(other, (np.ndarray, list, tuple)): + if len(other) != len(self): + raise ValueError("Lengths must be equal") + other = self._constructor(other, self.index, copy=False) + result = self._binop(other, op, level=level, fill_value=fill_value) + result._name = res_name + return result + else: + if fill_value is not None: + if isna(other): + return op(self, fill_value) + self = self.fillna(fill_value) + + return op(self, other) + + @Appender(ops.make_flex_doc("eq", "series")) + def eq( + self, + other, + level: Level | None = None, + fill_value: float | None = None, + axis: Axis = 0, + ) -> Series: + return self._flex_method( + other, operator.eq, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("ne", "series")) + def ne(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + return self._flex_method( + other, operator.ne, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("le", "series")) + def le(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + return self._flex_method( + other, operator.le, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("lt", "series")) + def lt(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + return self._flex_method( + other, operator.lt, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("ge", "series")) + def ge(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + return self._flex_method( + other, operator.ge, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("gt", "series")) + def gt(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + return self._flex_method( + other, operator.gt, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("add", "series")) + def add(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + return self._flex_method( + other, operator.add, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("radd", "series")) + def radd(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + return self._flex_method( + other, roperator.radd, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("sub", "series")) + def sub(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + return self._flex_method( + other, operator.sub, level=level, fill_value=fill_value, axis=axis + ) + + subtract = sub + + @Appender(ops.make_flex_doc("rsub", "series")) + def rsub(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + return self._flex_method( + other, roperator.rsub, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("mul", "series")) + def mul( + self, + other, + level: Level | None = None, + fill_value: float | None = None, + axis: Axis = 0, + ) -> Series: + return self._flex_method( + other, operator.mul, level=level, fill_value=fill_value, axis=axis + ) + + multiply = mul + + @Appender(ops.make_flex_doc("rmul", "series")) + def rmul(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + return self._flex_method( + other, roperator.rmul, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("truediv", "series")) + def truediv(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + return self._flex_method( + other, operator.truediv, level=level, fill_value=fill_value, axis=axis + ) + + div = truediv + divide = truediv + + @Appender(ops.make_flex_doc("rtruediv", "series")) + def rtruediv(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + return self._flex_method( + other, roperator.rtruediv, level=level, fill_value=fill_value, axis=axis + ) + + rdiv = rtruediv + + @Appender(ops.make_flex_doc("floordiv", "series")) + def floordiv(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + return self._flex_method( + other, operator.floordiv, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("rfloordiv", "series")) + def rfloordiv(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + return self._flex_method( + other, roperator.rfloordiv, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("mod", "series")) + def mod(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + return self._flex_method( + other, operator.mod, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("rmod", "series")) + def rmod(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + return self._flex_method( + other, roperator.rmod, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("pow", "series")) + def pow(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + return self._flex_method( + other, operator.pow, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("rpow", "series")) + def rpow(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + return self._flex_method( + other, roperator.rpow, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("divmod", "series")) + def divmod(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + return self._flex_method( + other, divmod, level=level, fill_value=fill_value, axis=axis + ) + + @Appender(ops.make_flex_doc("rdivmod", "series")) + def rdivmod(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + return self._flex_method( + other, roperator.rdivmod, level=level, fill_value=fill_value, axis=axis + ) + + # ---------------------------------------------------------------------- + # Reductions + + def _reduce( + self, + op, + # error: Variable "pandas.core.series.Series.str" is not valid as a type + name: str, # type: ignore[valid-type] + *, + axis: Axis = 0, + skipna: bool = True, + numeric_only: bool = False, + filter_type=None, + **kwds, + ): + """ + Perform a reduction operation. + + If we have an ndarray as a value, then simply perform the operation, + otherwise delegate to the object. + """ + delegate = self._values + + if axis is not None: + self._get_axis_number(axis) + + if isinstance(delegate, ExtensionArray): + # dispatch to ExtensionArray interface + return delegate._reduce(name, skipna=skipna, **kwds) + + else: + # dispatch to numpy arrays + if numeric_only and self.dtype.kind not in "iufcb": + # i.e. not is_numeric_dtype(self.dtype) + kwd_name = "numeric_only" + if name in ["any", "all"]: + kwd_name = "bool_only" + # GH#47500 - change to TypeError to match other methods + raise TypeError( + f"Series.{name} does not allow {kwd_name}={numeric_only} " + "with non-numeric dtypes." + ) + return op(delegate, skipna=skipna, **kwds) + + @Appender(make_doc("any", ndim=1)) + # error: Signature of "any" incompatible with supertype "NDFrame" + def any( # type: ignore[override] + self, + *, + axis: Axis = 0, + bool_only: bool = False, + skipna: bool = True, + **kwargs, + ) -> bool: + nv.validate_logical_func((), kwargs, fname="any") + validate_bool_kwarg(skipna, "skipna", none_allowed=False) + return self._reduce( + nanops.nanany, + name="any", + axis=axis, + numeric_only=bool_only, + skipna=skipna, + filter_type="bool", + ) + + @Appender(make_doc("all", ndim=1)) + def all( + self, + axis: Axis = 0, + bool_only: bool = False, + skipna: bool = True, + **kwargs, + ) -> bool: + nv.validate_logical_func((), kwargs, fname="all") + validate_bool_kwarg(skipna, "skipna", none_allowed=False) + return self._reduce( + nanops.nanall, + name="all", + axis=axis, + numeric_only=bool_only, + skipna=skipna, + filter_type="bool", + ) + + @doc(make_doc("min", ndim=1)) + def min( + self, + axis: Axis | None = 0, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ): + return NDFrame.min(self, axis, skipna, numeric_only, **kwargs) + + @doc(make_doc("max", ndim=1)) + def max( + self, + axis: Axis | None = 0, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ): + return NDFrame.max(self, axis, skipna, numeric_only, **kwargs) + + @doc(make_doc("sum", ndim=1)) + def sum( + self, + axis: Axis | None = None, + skipna: bool = True, + numeric_only: bool = False, + min_count: int = 0, + **kwargs, + ): + return NDFrame.sum(self, axis, skipna, numeric_only, min_count, **kwargs) + + @doc(make_doc("prod", ndim=1)) + def prod( + self, + axis: Axis | None = None, + skipna: bool = True, + numeric_only: bool = False, + min_count: int = 0, + **kwargs, + ): + return NDFrame.prod(self, axis, skipna, numeric_only, min_count, **kwargs) + + @doc(make_doc("mean", ndim=1)) + def mean( + self, + axis: Axis | None = 0, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ): + return NDFrame.mean(self, axis, skipna, numeric_only, **kwargs) + + @doc(make_doc("median", ndim=1)) + def median( + self, + axis: Axis | None = 0, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ): + return NDFrame.median(self, axis, skipna, numeric_only, **kwargs) + + @doc(make_doc("sem", ndim=1)) + def sem( + self, + axis: Axis | None = None, + skipna: bool = True, + ddof: int = 1, + numeric_only: bool = False, + **kwargs, + ): + return NDFrame.sem(self, axis, skipna, ddof, numeric_only, **kwargs) + + @doc(make_doc("var", ndim=1)) + def var( + self, + axis: Axis | None = None, + skipna: bool = True, + ddof: int = 1, + numeric_only: bool = False, + **kwargs, + ): + return NDFrame.var(self, axis, skipna, ddof, numeric_only, **kwargs) + + @doc(make_doc("std", ndim=1)) + def std( + self, + axis: Axis | None = None, + skipna: bool = True, + ddof: int = 1, + numeric_only: bool = False, + **kwargs, + ): + return NDFrame.std(self, axis, skipna, ddof, numeric_only, **kwargs) + + @doc(make_doc("skew", ndim=1)) + def skew( + self, + axis: Axis | None = 0, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ): + return NDFrame.skew(self, axis, skipna, numeric_only, **kwargs) + + @doc(make_doc("kurt", ndim=1)) + def kurt( + self, + axis: Axis | None = 0, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ): + return NDFrame.kurt(self, axis, skipna, numeric_only, **kwargs) + + kurtosis = kurt + product = prod + + @doc(make_doc("cummin", ndim=1)) + def cummin(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): + return NDFrame.cummin(self, axis, skipna, *args, **kwargs) + + @doc(make_doc("cummax", ndim=1)) + def cummax(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): + return NDFrame.cummax(self, axis, skipna, *args, **kwargs) + + @doc(make_doc("cumsum", ndim=1)) + def cumsum(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): + return NDFrame.cumsum(self, axis, skipna, *args, **kwargs) + + @doc(make_doc("cumprod", 1)) + def cumprod(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): + return NDFrame.cumprod(self, axis, skipna, *args, **kwargs) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/shared_docs.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/shared_docs.py new file mode 100644 index 0000000000000000000000000000000000000000..25f7e7e9f832b0142af32acbb9690d0ae3f0b5e3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/shared_docs.py @@ -0,0 +1,952 @@ +from __future__ import annotations + +_shared_docs: dict[str, str] = {} + +_shared_docs[ + "aggregate" +] = """ +Aggregate using one or more operations over the specified axis. + +Parameters +---------- +func : function, str, list or dict + Function to use for aggregating the data. If a function, must either + work when passed a {klass} or when passed to {klass}.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` + - dict of axis labels -> functions, function names or list of such. +{axis} +*args + Positional arguments to pass to `func`. +**kwargs + Keyword arguments to pass to `func`. + +Returns +------- +scalar, Series or DataFrame + + The return can be: + + * scalar : when Series.agg is called with single function + * Series : when DataFrame.agg is called with a single function + * DataFrame : when DataFrame.agg is called with several functions +{see_also} +Notes +----- +The aggregation operations are always performed over an axis, either the +index (default) or the column axis. This behavior is different from +`numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`, +`var`), where the default is to compute the aggregation of the flattened +array, e.g., ``numpy.mean(arr_2d)`` as opposed to +``numpy.mean(arr_2d, axis=0)``. + +`agg` is an alias for `aggregate`. Use the alias. + +Functions that mutate the passed object can produce unexpected +behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` +for more details. + +A passed user-defined-function will be passed a Series for evaluation. +{examples}""" + +_shared_docs[ + "compare" +] = """ +Compare to another {klass} and show the differences. + +Parameters +---------- +other : {klass} + Object to compare with. + +align_axis : {{0 or 'index', 1 or 'columns'}}, default 1 + Determine which axis to align the comparison on. + + * 0, or 'index' : Resulting differences are stacked vertically + with rows drawn alternately from self and other. + * 1, or 'columns' : Resulting differences are aligned horizontally + with columns drawn alternately from self and other. + +keep_shape : bool, default False + If true, all rows and columns are kept. + Otherwise, only the ones with different values are kept. + +keep_equal : bool, default False + If true, the result keeps values that are equal. + Otherwise, equal values are shown as NaNs. + +result_names : tuple, default ('self', 'other') + Set the dataframes names in the comparison. + + .. versionadded:: 1.5.0 +""" + +_shared_docs[ + "groupby" +] = """ +Group %(klass)s using a mapper or by a Series of columns. + +A groupby operation involves some combination of splitting the +object, applying a function, and combining the results. This can be +used to group large amounts of data and compute operations on these +groups. + +Parameters +---------- +by : mapping, function, label, pd.Grouper or list of such + Used to determine the groups for the groupby. + If ``by`` is a function, it's called on each value of the object's + index. If a dict or Series is passed, the Series or dict VALUES + will be used to determine the groups (the Series' values are first + aligned; see ``.align()`` method). If a list or ndarray of length + equal to the selected axis is passed (see the `groupby user guide + `_), + the values are used as-is to determine the groups. A label or list + of labels may be passed to group by the columns in ``self``. + Notice that a tuple is interpreted as a (single) key. +axis : {0 or 'index', 1 or 'columns'}, default 0 + Split along rows (0) or columns (1). For `Series` this parameter + is unused and defaults to 0. + + .. deprecated:: 2.1.0 + + Will be removed and behave like axis=0 in a future version. + For ``axis=1``, do ``frame.T.groupby(...)`` instead. + +level : int, level name, or sequence of such, default None + If the axis is a MultiIndex (hierarchical), group by a particular + level or levels. Do not specify both ``by`` and ``level``. +as_index : bool, default True + Return object with group labels as the + index. Only relevant for DataFrame input. as_index=False is + effectively "SQL-style" grouped output. This argument has no effect + on filtrations (see the `filtrations in the user guide + `_), + such as ``head()``, ``tail()``, ``nth()`` and in transformations + (see the `transformations in the user guide + `_). +sort : bool, default True + Sort group keys. Get better performance by turning this off. + Note this does not influence the order of observations within each + group. Groupby preserves the order of rows within each group. If False, + the groups will appear in the same order as they did in the original DataFrame. + This argument has no effect on filtrations (see the `filtrations in the user guide + `_), + such as ``head()``, ``tail()``, ``nth()`` and in transformations + (see the `transformations in the user guide + `_). + + .. versionchanged:: 2.0.0 + + Specifying ``sort=False`` with an ordered categorical grouper will no + longer sort the values. + +group_keys : bool, default True + When calling apply and the ``by`` argument produces a like-indexed + (i.e. :ref:`a transform `) result, add group keys to + index to identify pieces. By default group keys are not included + when the result's index (and column) labels match the inputs, and + are included otherwise. + + .. versionchanged:: 1.5.0 + + Warns that ``group_keys`` will no longer be ignored when the + result from ``apply`` is a like-indexed Series or DataFrame. + Specify ``group_keys`` explicitly to include the group keys or + not. + + .. versionchanged:: 2.0.0 + + ``group_keys`` now defaults to ``True``. + +observed : bool, default False + This only applies if any of the groupers are Categoricals. + If True: only show observed values for categorical groupers. + If False: show all values for categorical groupers. + + .. deprecated:: 2.1.0 + + The default value will change to True in a future version of pandas. + +dropna : bool, default True + If True, and if group keys contain NA values, NA values together + with row/column will be dropped. + If False, NA values will also be treated as the key in groups. + +Returns +------- +pandas.api.typing.%(klass)sGroupBy + Returns a groupby object that contains information about the groups. + +See Also +-------- +resample : Convenience method for frequency conversion and resampling + of time series. + +Notes +----- +See the `user guide +`__ for more +detailed usage and examples, including splitting an object into groups, +iterating through groups, selecting a group, aggregation, and more. +""" + +_shared_docs[ + "melt" +] = """ +Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. + +This function is useful to massage a DataFrame into a format where one +or more columns are identifier variables (`id_vars`), while all other +columns, considered measured variables (`value_vars`), are "unpivoted" to +the row axis, leaving just two non-identifier columns, 'variable' and +'value'. + +Parameters +---------- +id_vars : scalar, tuple, list, or ndarray, optional + Column(s) to use as identifier variables. +value_vars : scalar, tuple, list, or ndarray, optional + Column(s) to unpivot. If not specified, uses all columns that + are not set as `id_vars`. +var_name : scalar, default None + Name to use for the 'variable' column. If None it uses + ``frame.columns.name`` or 'variable'. +value_name : scalar, default 'value' + Name to use for the 'value' column, can't be an existing column label. +col_level : scalar, optional + If columns are a MultiIndex then use this level to melt. +ignore_index : bool, default True + If True, original index is ignored. If False, the original index is retained. + Index labels will be repeated as necessary. + +Returns +------- +DataFrame + Unpivoted DataFrame. + +See Also +-------- +%(other)s : Identical method. +pivot_table : Create a spreadsheet-style pivot table as a DataFrame. +DataFrame.pivot : Return reshaped DataFrame organized + by given index / column values. +DataFrame.explode : Explode a DataFrame from list-like + columns to long format. + +Notes +----- +Reference :ref:`the user guide ` for more examples. + +Examples +-------- +>>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, +... 'B': {0: 1, 1: 3, 2: 5}, +... 'C': {0: 2, 1: 4, 2: 6}}) +>>> df + A B C +0 a 1 2 +1 b 3 4 +2 c 5 6 + +>>> %(caller)sid_vars=['A'], value_vars=['B']) + A variable value +0 a B 1 +1 b B 3 +2 c B 5 + +>>> %(caller)sid_vars=['A'], value_vars=['B', 'C']) + A variable value +0 a B 1 +1 b B 3 +2 c B 5 +3 a C 2 +4 b C 4 +5 c C 6 + +The names of 'variable' and 'value' columns can be customized: + +>>> %(caller)sid_vars=['A'], value_vars=['B'], +... var_name='myVarname', value_name='myValname') + A myVarname myValname +0 a B 1 +1 b B 3 +2 c B 5 + +Original index values can be kept around: + +>>> %(caller)sid_vars=['A'], value_vars=['B', 'C'], ignore_index=False) + A variable value +0 a B 1 +1 b B 3 +2 c B 5 +0 a C 2 +1 b C 4 +2 c C 6 + +If you have multi-index columns: + +>>> df.columns = [list('ABC'), list('DEF')] +>>> df + A B C + D E F +0 a 1 2 +1 b 3 4 +2 c 5 6 + +>>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B']) + A variable value +0 a B 1 +1 b B 3 +2 c B 5 + +>>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')]) + (A, D) variable_0 variable_1 value +0 a B E 1 +1 b B E 3 +2 c B E 5 +""" + +_shared_docs[ + "transform" +] = """ +Call ``func`` on self producing a {klass} with the same axis shape as self. + +Parameters +---------- +func : function, str, list-like or dict-like + Function to use for transforming the data. If a function, must either + work when passed a {klass} or when passed to {klass}.apply. If func + is both list-like and dict-like, dict-like behavior takes precedence. + + Accepted combinations are: + + - function + - string function name + - list-like of functions and/or function names, e.g. ``[np.exp, 'sqrt']`` + - dict-like of axis labels -> functions, function names or list-like of such. +{axis} +*args + Positional arguments to pass to `func`. +**kwargs + Keyword arguments to pass to `func`. + +Returns +------- +{klass} + A {klass} that must have the same length as self. + +Raises +------ +ValueError : If the returned {klass} has a different length than self. + +See Also +-------- +{klass}.agg : Only perform aggregating type operations. +{klass}.apply : Invoke function on a {klass}. + +Notes +----- +Functions that mutate the passed object can produce unexpected +behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` +for more details. + +Examples +-------- +>>> df = pd.DataFrame({{'A': range(3), 'B': range(1, 4)}}) +>>> df + A B +0 0 1 +1 1 2 +2 2 3 +>>> df.transform(lambda x: x + 1) + A B +0 1 2 +1 2 3 +2 3 4 + +Even though the resulting {klass} must have the same length as the +input {klass}, it is possible to provide several input functions: + +>>> s = pd.Series(range(3)) +>>> s +0 0 +1 1 +2 2 +dtype: int64 +>>> s.transform([np.sqrt, np.exp]) + sqrt exp +0 0.000000 1.000000 +1 1.000000 2.718282 +2 1.414214 7.389056 + +You can call transform on a GroupBy object: + +>>> df = pd.DataFrame({{ +... "Date": [ +... "2015-05-08", "2015-05-07", "2015-05-06", "2015-05-05", +... "2015-05-08", "2015-05-07", "2015-05-06", "2015-05-05"], +... "Data": [5, 8, 6, 1, 50, 100, 60, 120], +... }}) +>>> df + Date Data +0 2015-05-08 5 +1 2015-05-07 8 +2 2015-05-06 6 +3 2015-05-05 1 +4 2015-05-08 50 +5 2015-05-07 100 +6 2015-05-06 60 +7 2015-05-05 120 +>>> df.groupby('Date')['Data'].transform('sum') +0 55 +1 108 +2 66 +3 121 +4 55 +5 108 +6 66 +7 121 +Name: Data, dtype: int64 + +>>> df = pd.DataFrame({{ +... "c": [1, 1, 1, 2, 2, 2, 2], +... "type": ["m", "n", "o", "m", "m", "n", "n"] +... }}) +>>> df + c type +0 1 m +1 1 n +2 1 o +3 2 m +4 2 m +5 2 n +6 2 n +>>> df['size'] = df.groupby('c')['type'].transform(len) +>>> df + c type size +0 1 m 3 +1 1 n 3 +2 1 o 3 +3 2 m 4 +4 2 m 4 +5 2 n 4 +6 2 n 4 +""" + +_shared_docs[ + "storage_options" +] = """storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc. For HTTP(S) URLs the key-value pairs + are forwarded to ``urllib.request.Request`` as header options. For other + URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are + forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more + details, and for more examples on storage options refer `here + `_.""" + +_shared_docs[ + "compression_options" +] = """compression : str or dict, default 'infer' + For on-the-fly compression of the output data. If 'infer' and '%s' is + path-like, then detect compression from the following extensions: '.gz', + '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2' + (otherwise no compression). + Set to ``None`` for no compression. + Can also be a dict with key ``'method'`` set + to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and + other key-value pairs are forwarded to + ``zipfile.ZipFile``, ``gzip.GzipFile``, + ``bz2.BZ2File``, ``zstandard.ZstdCompressor``, ``lzma.LZMAFile`` or + ``tarfile.TarFile``, respectively. + As an example, the following could be passed for faster compression and to create + a reproducible gzip archive: + ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``. + + .. versionadded:: 1.5.0 + Added support for `.tar` files.""" + +_shared_docs[ + "decompression_options" +] = """compression : str or dict, default 'infer' + For on-the-fly decompression of on-disk data. If 'infer' and '%s' is + path-like, then detect compression from the following extensions: '.gz', + '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2' + (otherwise no compression). + If using 'zip' or 'tar', the ZIP file must contain only one data file to be read in. + Set to ``None`` for no decompression. + Can also be a dict with key ``'method'`` set + to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and + other key-value pairs are forwarded to + ``zipfile.ZipFile``, ``gzip.GzipFile``, + ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or + ``tarfile.TarFile``, respectively. + As an example, the following could be passed for Zstandard decompression using a + custom compression dictionary: + ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``. + + .. versionadded:: 1.5.0 + Added support for `.tar` files.""" + +_shared_docs[ + "replace" +] = """ + Replace values given in `to_replace` with `value`. + + Values of the {klass} are replaced with other values dynamically. + This differs from updating with ``.loc`` or ``.iloc``, which require + you to specify a location to update with some value. + + Parameters + ---------- + to_replace : str, regex, list, dict, Series, int, float, or None + How to find the values that will be replaced. + + * numeric, str or regex: + + - numeric: numeric values equal to `to_replace` will be + replaced with `value` + - str: string exactly matching `to_replace` will be replaced + with `value` + - regex: regexs matching `to_replace` will be replaced with + `value` + + * list of str, regex, or numeric: + + - First, if `to_replace` and `value` are both lists, they + **must** be the same length. + - Second, if ``regex=True`` then all of the strings in **both** + lists will be interpreted as regexs otherwise they will match + directly. This doesn't matter much for `value` since there + are only a few possible substitution regexes you can use. + - str, regex and numeric rules apply as above. + + * dict: + + - Dicts can be used to specify different replacement values + for different existing values. For example, + ``{{'a': 'b', 'y': 'z'}}`` replaces the value 'a' with 'b' and + 'y' with 'z'. To use a dict in this way, the optional `value` + parameter should not be given. + - For a DataFrame a dict can specify that different values + should be replaced in different columns. For example, + ``{{'a': 1, 'b': 'z'}}`` looks for the value 1 in column 'a' + and the value 'z' in column 'b' and replaces these values + with whatever is specified in `value`. The `value` parameter + should not be ``None`` in this case. You can treat this as a + special case of passing two lists except that you are + specifying the column to search in. + - For a DataFrame nested dictionaries, e.g., + ``{{'a': {{'b': np.nan}}}}``, are read as follows: look in column + 'a' for the value 'b' and replace it with NaN. The optional `value` + parameter should not be specified to use a nested dict in this + way. You can nest regular expressions as well. Note that + column names (the top-level dictionary keys in a nested + dictionary) **cannot** be regular expressions. + + * None: + + - This means that the `regex` argument must be a string, + compiled regular expression, or list, dict, ndarray or + Series of such elements. If `value` is also ``None`` then + this **must** be a nested dictionary or Series. + + See the examples section for examples of each of these. + value : scalar, dict, list, str, regex, default None + Value to replace any values matching `to_replace` with. + For a DataFrame a dict of values can be used to specify which + value to use for each column (columns not in the dict will not be + filled). Regular expressions, strings and lists or dicts of such + objects are also allowed. + {inplace} + limit : int, default None + Maximum size gap to forward or backward fill. + + .. deprecated:: 2.1.0 + regex : bool or same types as `to_replace`, default False + Whether to interpret `to_replace` and/or `value` as regular + expressions. Alternatively, this could be a regular expression or a + list, dict, or array of regular expressions in which case + `to_replace` must be ``None``. + method : {{'pad', 'ffill', 'bfill'}} + The method to use when for replacement, when `to_replace` is a + scalar, list or tuple and `value` is ``None``. + + .. deprecated:: 2.1.0 + + Returns + ------- + {klass} + Object after replacement. + + Raises + ------ + AssertionError + * If `regex` is not a ``bool`` and `to_replace` is not + ``None``. + + TypeError + * If `to_replace` is not a scalar, array-like, ``dict``, or ``None`` + * If `to_replace` is a ``dict`` and `value` is not a ``list``, + ``dict``, ``ndarray``, or ``Series`` + * If `to_replace` is ``None`` and `regex` is not compilable + into a regular expression or is a list, dict, ndarray, or + Series. + * When replacing multiple ``bool`` or ``datetime64`` objects and + the arguments to `to_replace` does not match the type of the + value being replaced + + ValueError + * If a ``list`` or an ``ndarray`` is passed to `to_replace` and + `value` but they are not the same length. + + See Also + -------- + Series.fillna : Fill NA values. + DataFrame.fillna : Fill NA values. + Series.where : Replace values based on boolean condition. + DataFrame.where : Replace values based on boolean condition. + DataFrame.map: Apply a function to a Dataframe elementwise. + Series.map: Map values of Series according to an input mapping or function. + Series.str.replace : Simple string replacement. + + Notes + ----- + * Regex substitution is performed under the hood with ``re.sub``. The + rules for substitution for ``re.sub`` are the same. + * Regular expressions will only substitute on strings, meaning you + cannot provide, for example, a regular expression matching floating + point numbers and expect the columns in your frame that have a + numeric dtype to be matched. However, if those floating point + numbers *are* strings, then you can do this. + * This method has *a lot* of options. You are encouraged to experiment + and play with this method to gain intuition about how it works. + * When dict is used as the `to_replace` value, it is like + key(s) in the dict are the to_replace part and + value(s) in the dict are the value parameter. + + Examples + -------- + + **Scalar `to_replace` and `value`** + + >>> s = pd.Series([1, 2, 3, 4, 5]) + >>> s.replace(1, 5) + 0 5 + 1 2 + 2 3 + 3 4 + 4 5 + dtype: int64 + + >>> df = pd.DataFrame({{'A': [0, 1, 2, 3, 4], + ... 'B': [5, 6, 7, 8, 9], + ... 'C': ['a', 'b', 'c', 'd', 'e']}}) + >>> df.replace(0, 5) + A B C + 0 5 5 a + 1 1 6 b + 2 2 7 c + 3 3 8 d + 4 4 9 e + + **List-like `to_replace`** + + >>> df.replace([0, 1, 2, 3], 4) + A B C + 0 4 5 a + 1 4 6 b + 2 4 7 c + 3 4 8 d + 4 4 9 e + + >>> df.replace([0, 1, 2, 3], [4, 3, 2, 1]) + A B C + 0 4 5 a + 1 3 6 b + 2 2 7 c + 3 1 8 d + 4 4 9 e + + >>> s.replace([1, 2], method='bfill') + 0 3 + 1 3 + 2 3 + 3 4 + 4 5 + dtype: int64 + + **dict-like `to_replace`** + + >>> df.replace({{0: 10, 1: 100}}) + A B C + 0 10 5 a + 1 100 6 b + 2 2 7 c + 3 3 8 d + 4 4 9 e + + >>> df.replace({{'A': 0, 'B': 5}}, 100) + A B C + 0 100 100 a + 1 1 6 b + 2 2 7 c + 3 3 8 d + 4 4 9 e + + >>> df.replace({{'A': {{0: 100, 4: 400}}}}) + A B C + 0 100 5 a + 1 1 6 b + 2 2 7 c + 3 3 8 d + 4 400 9 e + + **Regular expression `to_replace`** + + >>> df = pd.DataFrame({{'A': ['bat', 'foo', 'bait'], + ... 'B': ['abc', 'bar', 'xyz']}}) + >>> df.replace(to_replace=r'^ba.$', value='new', regex=True) + A B + 0 new abc + 1 foo new + 2 bait xyz + + >>> df.replace({{'A': r'^ba.$'}}, {{'A': 'new'}}, regex=True) + A B + 0 new abc + 1 foo bar + 2 bait xyz + + >>> df.replace(regex=r'^ba.$', value='new') + A B + 0 new abc + 1 foo new + 2 bait xyz + + >>> df.replace(regex={{r'^ba.$': 'new', 'foo': 'xyz'}}) + A B + 0 new abc + 1 xyz new + 2 bait xyz + + >>> df.replace(regex=[r'^ba.$', 'foo'], value='new') + A B + 0 new abc + 1 new new + 2 bait xyz + + Compare the behavior of ``s.replace({{'a': None}})`` and + ``s.replace('a', None)`` to understand the peculiarities + of the `to_replace` parameter: + + >>> s = pd.Series([10, 'a', 'a', 'b', 'a']) + + When one uses a dict as the `to_replace` value, it is like the + value(s) in the dict are equal to the `value` parameter. + ``s.replace({{'a': None}})`` is equivalent to + ``s.replace(to_replace={{'a': None}}, value=None, method=None)``: + + >>> s.replace({{'a': None}}) + 0 10 + 1 None + 2 None + 3 b + 4 None + dtype: object + + When ``value`` is not explicitly passed and `to_replace` is a scalar, list + or tuple, `replace` uses the method parameter (default 'pad') to do the + replacement. So this is why the 'a' values are being replaced by 10 + in rows 1 and 2 and 'b' in row 4 in this case. + + >>> s.replace('a') + 0 10 + 1 10 + 2 10 + 3 b + 4 b + dtype: object + + .. deprecated:: 2.1.0 + The 'method' parameter and padding behavior are deprecated. + + On the other hand, if ``None`` is explicitly passed for ``value``, it will + be respected: + + >>> s.replace('a', None) + 0 10 + 1 None + 2 None + 3 b + 4 None + dtype: object + + .. versionchanged:: 1.4.0 + Previously the explicit ``None`` was silently ignored. + + When ``regex=True``, ``value`` is not ``None`` and `to_replace` is a string, + the replacement will be applied in all columns of the DataFrame. + + >>> df = pd.DataFrame({{'A': [0, 1, 2, 3, 4], + ... 'B': ['a', 'b', 'c', 'd', 'e'], + ... 'C': ['f', 'g', 'h', 'i', 'j']}}) + + >>> df.replace(to_replace='^[a-g]', value='e', regex=True) + A B C + 0 0 e e + 1 1 e e + 2 2 e h + 3 3 e i + 4 4 e j + + If ``value`` is not ``None`` and `to_replace` is a dictionary, the dictionary + keys will be the DataFrame columns that the replacement will be applied. + + >>> df.replace(to_replace={{'B': '^[a-c]', 'C': '^[h-j]'}}, value='e', regex=True) + A B C + 0 0 e f + 1 1 e g + 2 2 e e + 3 3 d e + 4 4 e e +""" + +_shared_docs[ + "idxmin" +] = """ + Return index of first occurrence of minimum over requested axis. + + NA/null values are excluded. + + Parameters + ---------- + axis : {{0 or 'index', 1 or 'columns'}}, default 0 + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + numeric_only : bool, default {numeric_only_default} + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series + Indexes of minima along the specified axis. + + Raises + ------ + ValueError + * If the row/column is empty + + See Also + -------- + Series.idxmin : Return index of the minimum element. + + Notes + ----- + This method is the DataFrame version of ``ndarray.argmin``. + + Examples + -------- + Consider a dataset containing food consumption in Argentina. + + >>> df = pd.DataFrame({{'consumption': [10.51, 103.11, 55.48], + ... 'co2_emissions': [37.2, 19.66, 1712]}}, + ... index=['Pork', 'Wheat Products', 'Beef']) + + >>> df + consumption co2_emissions + Pork 10.51 37.20 + Wheat Products 103.11 19.66 + Beef 55.48 1712.00 + + By default, it returns the index for the minimum value in each column. + + >>> df.idxmin() + consumption Pork + co2_emissions Wheat Products + dtype: object + + To return the index for the minimum value in each row, use ``axis="columns"``. + + >>> df.idxmin(axis="columns") + Pork consumption + Wheat Products co2_emissions + Beef consumption + dtype: object +""" + +_shared_docs[ + "idxmax" +] = """ + Return index of first occurrence of maximum over requested axis. + + NA/null values are excluded. + + Parameters + ---------- + axis : {{0 or 'index', 1 or 'columns'}}, default 0 + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + numeric_only : bool, default {numeric_only_default} + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series + Indexes of maxima along the specified axis. + + Raises + ------ + ValueError + * If the row/column is empty + + See Also + -------- + Series.idxmax : Return index of the maximum element. + + Notes + ----- + This method is the DataFrame version of ``ndarray.argmax``. + + Examples + -------- + Consider a dataset containing food consumption in Argentina. + + >>> df = pd.DataFrame({{'consumption': [10.51, 103.11, 55.48], + ... 'co2_emissions': [37.2, 19.66, 1712]}}, + ... index=['Pork', 'Wheat Products', 'Beef']) + + >>> df + consumption co2_emissions + Pork 10.51 37.20 + Wheat Products 103.11 19.66 + Beef 55.48 1712.00 + + By default, it returns the index for the maximum value in each column. + + >>> df.idxmax() + consumption Wheat Products + co2_emissions Beef + dtype: object + + To return the index for the maximum value in each row, use ``axis="columns"``. + + >>> df.idxmax(axis="columns") + Pork co2_emissions + Wheat Products consumption + Beef co2_emissions + dtype: object +""" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/sorting.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/sorting.py new file mode 100644 index 0000000000000000000000000000000000000000..a431842218b3bbb97cb5373e03d2a31a4f69ca10 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/sorting.py @@ -0,0 +1,748 @@ +""" miscellaneous sorting / groupby utilities """ +from __future__ import annotations + +from collections import defaultdict +from typing import ( + TYPE_CHECKING, + Callable, + DefaultDict, + cast, +) + +import numpy as np + +from pandas._libs import ( + algos, + hashtable, + lib, +) +from pandas._libs.hashtable import unique_label_indices + +from pandas.core.dtypes.common import ( + ensure_int64, + ensure_platform_int, +) +from pandas.core.dtypes.generic import ( + ABCMultiIndex, + ABCRangeIndex, +) +from pandas.core.dtypes.missing import isna + +from pandas.core.construction import extract_array + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Iterable, + Sequence, + ) + + from pandas._typing import ( + ArrayLike, + AxisInt, + IndexKeyFunc, + Level, + NaPosition, + Shape, + SortKind, + npt, + ) + + from pandas import ( + MultiIndex, + Series, + ) + from pandas.core.arrays import ExtensionArray + from pandas.core.indexes.base import Index + + +def get_indexer_indexer( + target: Index, + level: Level | list[Level] | None, + ascending: list[bool] | bool, + kind: SortKind, + na_position: NaPosition, + sort_remaining: bool, + key: IndexKeyFunc, +) -> npt.NDArray[np.intp] | None: + """ + Helper method that return the indexer according to input parameters for + the sort_index method of DataFrame and Series. + + Parameters + ---------- + target : Index + level : int or level name or list of ints or list of level names + ascending : bool or list of bools, default True + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'} + na_position : {'first', 'last'} + sort_remaining : bool + key : callable, optional + + Returns + ------- + Optional[ndarray[intp]] + The indexer for the new index. + """ + + # error: Incompatible types in assignment (expression has type + # "Union[ExtensionArray, ndarray[Any, Any], Index, Series]", variable has + # type "Index") + target = ensure_key_mapped(target, key, levels=level) # type: ignore[assignment] + target = target._sort_levels_monotonic() + + if level is not None: + _, indexer = target.sortlevel( + level, + ascending=ascending, + sort_remaining=sort_remaining, + na_position=na_position, + ) + elif (np.all(ascending) and target.is_monotonic_increasing) or ( + not np.any(ascending) and target.is_monotonic_decreasing + ): + # Check monotonic-ness before sort an index (GH 11080) + return None + elif isinstance(target, ABCMultiIndex): + codes = [lev.codes for lev in target._get_codes_for_sorting()] + indexer = lexsort_indexer( + codes, orders=ascending, na_position=na_position, codes_given=True + ) + else: + # ascending can only be a Sequence for MultiIndex + indexer = nargsort( + target, + kind=kind, + ascending=cast(bool, ascending), + na_position=na_position, + ) + return indexer + + +def get_group_index( + labels, shape: Shape, sort: bool, xnull: bool +) -> npt.NDArray[np.int64]: + """ + For the particular label_list, gets the offsets into the hypothetical list + representing the totally ordered cartesian product of all possible label + combinations, *as long as* this space fits within int64 bounds; + otherwise, though group indices identify unique combinations of + labels, they cannot be deconstructed. + - If `sort`, rank of returned ids preserve lexical ranks of labels. + i.e. returned id's can be used to do lexical sort on labels; + - If `xnull` nulls (-1 labels) are passed through. + + Parameters + ---------- + labels : sequence of arrays + Integers identifying levels at each location + shape : tuple[int, ...] + Number of unique levels at each location + sort : bool + If the ranks of returned ids should match lexical ranks of labels + xnull : bool + If true nulls are excluded. i.e. -1 values in the labels are + passed through. + + Returns + ------- + An array of type int64 where two elements are equal if their corresponding + labels are equal at all location. + + Notes + ----- + The length of `labels` and `shape` must be identical. + """ + + def _int64_cut_off(shape) -> int: + acc = 1 + for i, mul in enumerate(shape): + acc *= int(mul) + if not acc < lib.i8max: + return i + return len(shape) + + def maybe_lift(lab, size: int) -> tuple[np.ndarray, int]: + # promote nan values (assigned -1 label in lab array) + # so that all output values are non-negative + return (lab + 1, size + 1) if (lab == -1).any() else (lab, size) + + labels = [ensure_int64(x) for x in labels] + lshape = list(shape) + if not xnull: + for i, (lab, size) in enumerate(zip(labels, shape)): + labels[i], lshape[i] = maybe_lift(lab, size) + + labels = list(labels) + + # Iteratively process all the labels in chunks sized so less + # than lib.i8max unique int ids will be required for each chunk + while True: + # how many levels can be done without overflow: + nlev = _int64_cut_off(lshape) + + # compute flat ids for the first `nlev` levels + stride = np.prod(lshape[1:nlev], dtype="i8") + out = stride * labels[0].astype("i8", subok=False, copy=False) + + for i in range(1, nlev): + if lshape[i] == 0: + stride = np.int64(0) + else: + stride //= lshape[i] + out += labels[i] * stride + + if xnull: # exclude nulls + mask = labels[0] == -1 + for lab in labels[1:nlev]: + mask |= lab == -1 + out[mask] = -1 + + if nlev == len(lshape): # all levels done! + break + + # compress what has been done so far in order to avoid overflow + # to retain lexical ranks, obs_ids should be sorted + comp_ids, obs_ids = compress_group_index(out, sort=sort) + + labels = [comp_ids] + labels[nlev:] + lshape = [len(obs_ids)] + lshape[nlev:] + + return out + + +def get_compressed_ids( + labels, sizes: Shape +) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64]]: + """ + Group_index is offsets into cartesian product of all possible labels. This + space can be huge, so this function compresses it, by computing offsets + (comp_ids) into the list of unique labels (obs_group_ids). + + Parameters + ---------- + labels : list of label arrays + sizes : tuple[int] of size of the levels + + Returns + ------- + np.ndarray[np.intp] + comp_ids + np.ndarray[np.int64] + obs_group_ids + """ + ids = get_group_index(labels, sizes, sort=True, xnull=False) + return compress_group_index(ids, sort=True) + + +def is_int64_overflow_possible(shape: Shape) -> bool: + the_prod = 1 + for x in shape: + the_prod *= int(x) + + return the_prod >= lib.i8max + + +def _decons_group_index( + comp_labels: npt.NDArray[np.intp], shape: Shape +) -> list[npt.NDArray[np.intp]]: + # reconstruct labels + if is_int64_overflow_possible(shape): + # at some point group indices are factorized, + # and may not be deconstructed here! wrong path! + raise ValueError("cannot deconstruct factorized group indices!") + + label_list = [] + factor = 1 + y = np.array(0) + x = comp_labels + for i in reversed(range(len(shape))): + labels = (x - y) % (factor * shape[i]) // factor + np.putmask(labels, comp_labels < 0, -1) + label_list.append(labels) + y = labels * factor + factor *= shape[i] + return label_list[::-1] + + +def decons_obs_group_ids( + comp_ids: npt.NDArray[np.intp], + obs_ids: npt.NDArray[np.intp], + shape: Shape, + labels: Sequence[npt.NDArray[np.signedinteger]], + xnull: bool, +) -> list[npt.NDArray[np.intp]]: + """ + Reconstruct labels from observed group ids. + + Parameters + ---------- + comp_ids : np.ndarray[np.intp] + obs_ids: np.ndarray[np.intp] + shape : tuple[int] + labels : Sequence[np.ndarray[np.signedinteger]] + xnull : bool + If nulls are excluded; i.e. -1 labels are passed through. + """ + if not xnull: + lift = np.fromiter(((a == -1).any() for a in labels), dtype=np.intp) + arr_shape = np.asarray(shape, dtype=np.intp) + lift + shape = tuple(arr_shape) + + if not is_int64_overflow_possible(shape): + # obs ids are deconstructable! take the fast route! + out = _decons_group_index(obs_ids, shape) + return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)] + + indexer = unique_label_indices(comp_ids) + return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels] + + +def lexsort_indexer( + keys: Sequence[ArrayLike | Index | Series], + orders=None, + na_position: str = "last", + key: Callable | None = None, + codes_given: bool = False, +) -> npt.NDArray[np.intp]: + """ + Performs lexical sorting on a set of keys + + Parameters + ---------- + keys : Sequence[ArrayLike | Index | Series] + Sequence of arrays to be sorted by the indexer + Sequence[Series] is only if key is not None. + orders : bool or list of booleans, optional + Determines the sorting order for each element in keys. If a list, + it must be the same length as keys. This determines whether the + corresponding element in keys should be sorted in ascending + (True) or descending (False) order. if bool, applied to all + elements as above. if None, defaults to True. + na_position : {'first', 'last'}, default 'last' + Determines placement of NA elements in the sorted list ("last" or "first") + key : Callable, optional + Callable key function applied to every element in keys before sorting + codes_given: bool, False + Avoid categorical materialization if codes are already provided. + + Returns + ------- + np.ndarray[np.intp] + """ + from pandas.core.arrays import Categorical + + if na_position not in ["last", "first"]: + raise ValueError(f"invalid na_position: {na_position}") + + if isinstance(orders, bool): + orders = [orders] * len(keys) + elif orders is None: + orders = [True] * len(keys) + + labels = [] + + for k, order in zip(keys, orders): + k = ensure_key_mapped(k, key) + if codes_given: + codes = cast(np.ndarray, k) + n = codes.max() + 1 if len(codes) else 0 + else: + cat = Categorical(k, ordered=True) + codes = cat.codes + n = len(cat.categories) + + mask = codes == -1 + + if na_position == "last" and mask.any(): + codes = np.where(mask, n, codes) + + # not order means descending + if not order: + codes = np.where(mask, codes, n - codes - 1) + + labels.append(codes) + + return np.lexsort(labels[::-1]) + + +def nargsort( + items: ArrayLike | Index | Series, + kind: SortKind = "quicksort", + ascending: bool = True, + na_position: str = "last", + key: Callable | None = None, + mask: npt.NDArray[np.bool_] | None = None, +) -> npt.NDArray[np.intp]: + """ + Intended to be a drop-in replacement for np.argsort which handles NaNs. + + Adds ascending, na_position, and key parameters. + + (GH #6399, #5231, #27237) + + Parameters + ---------- + items : np.ndarray, ExtensionArray, Index, or Series + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort' + ascending : bool, default True + na_position : {'first', 'last'}, default 'last' + key : Optional[Callable], default None + mask : Optional[np.ndarray[bool]], default None + Passed when called by ExtensionArray.argsort. + + Returns + ------- + np.ndarray[np.intp] + """ + + if key is not None: + # see TestDataFrameSortKey, TestRangeIndex::test_sort_values_key + items = ensure_key_mapped(items, key) + return nargsort( + items, + kind=kind, + ascending=ascending, + na_position=na_position, + key=None, + mask=mask, + ) + + if isinstance(items, ABCRangeIndex): + return items.argsort(ascending=ascending) + elif not isinstance(items, ABCMultiIndex): + items = extract_array(items) + else: + raise TypeError( + "nargsort does not support MultiIndex. Use index.sort_values instead." + ) + + if mask is None: + mask = np.asarray(isna(items)) + + if not isinstance(items, np.ndarray): + # i.e. ExtensionArray + return items.argsort( + ascending=ascending, + kind=kind, + na_position=na_position, + ) + + idx = np.arange(len(items)) + non_nans = items[~mask] + non_nan_idx = idx[~mask] + + nan_idx = np.nonzero(mask)[0] + if not ascending: + non_nans = non_nans[::-1] + non_nan_idx = non_nan_idx[::-1] + indexer = non_nan_idx[non_nans.argsort(kind=kind)] + if not ascending: + indexer = indexer[::-1] + # Finally, place the NaNs at the end or the beginning according to + # na_position + if na_position == "last": + indexer = np.concatenate([indexer, nan_idx]) + elif na_position == "first": + indexer = np.concatenate([nan_idx, indexer]) + else: + raise ValueError(f"invalid na_position: {na_position}") + return ensure_platform_int(indexer) + + +def nargminmax(values: ExtensionArray, method: str, axis: AxisInt = 0): + """ + Implementation of np.argmin/argmax but for ExtensionArray and which + handles missing values. + + Parameters + ---------- + values : ExtensionArray + method : {"argmax", "argmin"} + axis : int, default 0 + + Returns + ------- + int + """ + assert method in {"argmax", "argmin"} + func = np.argmax if method == "argmax" else np.argmin + + mask = np.asarray(isna(values)) + arr_values = values._values_for_argsort() + + if arr_values.ndim > 1: + if mask.any(): + if axis == 1: + zipped = zip(arr_values, mask) + else: + zipped = zip(arr_values.T, mask.T) + return np.array([_nanargminmax(v, m, func) for v, m in zipped]) + return func(arr_values, axis=axis) + + return _nanargminmax(arr_values, mask, func) + + +def _nanargminmax(values: np.ndarray, mask: npt.NDArray[np.bool_], func) -> int: + """ + See nanargminmax.__doc__. + """ + idx = np.arange(values.shape[0]) + non_nans = values[~mask] + non_nan_idx = idx[~mask] + + return non_nan_idx[func(non_nans)] + + +def _ensure_key_mapped_multiindex( + index: MultiIndex, key: Callable, level=None +) -> MultiIndex: + """ + Returns a new MultiIndex in which key has been applied + to all levels specified in level (or all levels if level + is None). Used for key sorting for MultiIndex. + + Parameters + ---------- + index : MultiIndex + Index to which to apply the key function on the + specified levels. + key : Callable + Function that takes an Index and returns an Index of + the same shape. This key is applied to each level + separately. The name of the level can be used to + distinguish different levels for application. + level : list-like, int or str, default None + Level or list of levels to apply the key function to. + If None, key function is applied to all levels. Other + levels are left unchanged. + + Returns + ------- + labels : MultiIndex + Resulting MultiIndex with modified levels. + """ + + if level is not None: + if isinstance(level, (str, int)): + sort_levels = [level] + else: + sort_levels = level + + sort_levels = [index._get_level_number(lev) for lev in sort_levels] + else: + sort_levels = list(range(index.nlevels)) # satisfies mypy + + mapped = [ + ensure_key_mapped(index._get_level_values(level), key) + if level in sort_levels + else index._get_level_values(level) + for level in range(index.nlevels) + ] + + return type(index).from_arrays(mapped) + + +def ensure_key_mapped( + values: ArrayLike | Index | Series, key: Callable | None, levels=None +) -> ArrayLike | Index | Series: + """ + Applies a callable key function to the values function and checks + that the resulting value has the same shape. Can be called on Index + subclasses, Series, DataFrames, or ndarrays. + + Parameters + ---------- + values : Series, DataFrame, Index subclass, or ndarray + key : Optional[Callable], key to be called on the values array + levels : Optional[List], if values is a MultiIndex, list of levels to + apply the key to. + """ + from pandas.core.indexes.api import Index + + if not key: + return values + + if isinstance(values, ABCMultiIndex): + return _ensure_key_mapped_multiindex(values, key, level=levels) + + result = key(values.copy()) + if len(result) != len(values): + raise ValueError( + "User-provided `key` function must not change the shape of the array." + ) + + try: + if isinstance( + values, Index + ): # convert to a new Index subclass, not necessarily the same + result = Index(result) + else: + # try to revert to original type otherwise + type_of_values = type(values) + # error: Too many arguments for "ExtensionArray" + result = type_of_values(result) # type: ignore[call-arg] + except TypeError: + raise TypeError( + f"User-provided `key` function returned an invalid type {type(result)} \ + which could not be converted to {type(values)}." + ) + + return result + + +def get_flattened_list( + comp_ids: npt.NDArray[np.intp], + ngroups: int, + levels: Iterable[Index], + labels: Iterable[np.ndarray], +) -> list[tuple]: + """Map compressed group id -> key tuple.""" + comp_ids = comp_ids.astype(np.int64, copy=False) + arrays: DefaultDict[int, list[int]] = defaultdict(list) + for labs, level in zip(labels, levels): + table = hashtable.Int64HashTable(ngroups) + table.map_keys_to_values(comp_ids, labs.astype(np.int64, copy=False)) + for i in range(ngroups): + arrays[i].append(level[table.get_item(i)]) + return [tuple(array) for array in arrays.values()] + + +def get_indexer_dict( + label_list: list[np.ndarray], keys: list[Index] +) -> dict[Hashable, npt.NDArray[np.intp]]: + """ + Returns + ------- + dict: + Labels mapped to indexers. + """ + shape = tuple(len(x) for x in keys) + + group_index = get_group_index(label_list, shape, sort=True, xnull=True) + if np.all(group_index == -1): + # Short-circuit, lib.indices_fast will return the same + return {} + ngroups = ( + ((group_index.size and group_index.max()) + 1) + if is_int64_overflow_possible(shape) + else np.prod(shape, dtype="i8") + ) + + sorter = get_group_index_sorter(group_index, ngroups) + + sorted_labels = [lab.take(sorter) for lab in label_list] + group_index = group_index.take(sorter) + + return lib.indices_fast(sorter, group_index, keys, sorted_labels) + + +# ---------------------------------------------------------------------- +# sorting levels...cleverly? + + +def get_group_index_sorter( + group_index: npt.NDArray[np.intp], ngroups: int | None = None +) -> npt.NDArray[np.intp]: + """ + algos.groupsort_indexer implements `counting sort` and it is at least + O(ngroups), where + ngroups = prod(shape) + shape = map(len, keys) + that is, linear in the number of combinations (cartesian product) of unique + values of groupby keys. This can be huge when doing multi-key groupby. + np.argsort(kind='mergesort') is O(count x log(count)) where count is the + length of the data-frame; + Both algorithms are `stable` sort and that is necessary for correctness of + groupby operations. e.g. consider: + df.groupby(key)[col].transform('first') + + Parameters + ---------- + group_index : np.ndarray[np.intp] + signed integer dtype + ngroups : int or None, default None + + Returns + ------- + np.ndarray[np.intp] + """ + if ngroups is None: + ngroups = 1 + group_index.max() + count = len(group_index) + alpha = 0.0 # taking complexities literally; there may be + beta = 1.0 # some room for fine-tuning these parameters + do_groupsort = count > 0 and ((alpha + beta * ngroups) < (count * np.log(count))) + if do_groupsort: + sorter, _ = algos.groupsort_indexer( + ensure_platform_int(group_index), + ngroups, + ) + # sorter _should_ already be intp, but mypy is not yet able to verify + else: + sorter = group_index.argsort(kind="mergesort") + return ensure_platform_int(sorter) + + +def compress_group_index( + group_index: npt.NDArray[np.int64], sort: bool = True +) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: + """ + Group_index is offsets into cartesian product of all possible labels. This + space can be huge, so this function compresses it, by computing offsets + (comp_ids) into the list of unique labels (obs_group_ids). + """ + if len(group_index) and np.all(group_index[1:] >= group_index[:-1]): + # GH 53806: fast path for sorted group_index + unique_mask = np.concatenate( + [group_index[:1] > -1, group_index[1:] != group_index[:-1]] + ) + comp_ids = unique_mask.cumsum() + comp_ids -= 1 + obs_group_ids = group_index[unique_mask] + else: + size_hint = len(group_index) + table = hashtable.Int64HashTable(size_hint) + + group_index = ensure_int64(group_index) + + # note, group labels come out ascending (ie, 1,2,3 etc) + comp_ids, obs_group_ids = table.get_labels_groupby(group_index) + + if sort and len(obs_group_ids) > 0: + obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids) + + return ensure_int64(comp_ids), ensure_int64(obs_group_ids) + + +def _reorder_by_uniques( + uniques: npt.NDArray[np.int64], labels: npt.NDArray[np.intp] +) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.intp]]: + """ + Parameters + ---------- + uniques : np.ndarray[np.int64] + labels : np.ndarray[np.intp] + + Returns + ------- + np.ndarray[np.int64] + np.ndarray[np.intp] + """ + # sorter is index where elements ought to go + sorter = uniques.argsort() + + # reverse_indexer is where elements came from + reverse_indexer = np.empty(len(sorter), dtype=np.intp) + reverse_indexer.put(sorter, np.arange(len(sorter))) + + mask = labels < 0 + + # move labels to right locations (ie, unsort ascending labels) + labels = reverse_indexer.take(labels) + np.putmask(labels, mask, -1) + + # sort observed ids + uniques = uniques.take(sorter) + + return uniques, labels diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c804b81c49e7c8abb406f2132909df6036df1c09 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/__init__.py @@ -0,0 +1,13 @@ +# ruff: noqa: TCH004 +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + # import modules that have public classes/functions + from pandas.io import ( + formats, + json, + stata, + ) + + # mark only those modules as public + __all__ = ["formats", "json", "stata"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/_util.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/_util.py new file mode 100644 index 0000000000000000000000000000000000000000..3b2ae5daffdbaf515a330a54a83e550751e29fdb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/_util.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from typing import Callable + +from pandas.compat._optional import import_optional_dependency + +import pandas as pd + + +def _arrow_dtype_mapping() -> dict: + pa = import_optional_dependency("pyarrow") + return { + pa.int8(): pd.Int8Dtype(), + pa.int16(): pd.Int16Dtype(), + pa.int32(): pd.Int32Dtype(), + pa.int64(): pd.Int64Dtype(), + pa.uint8(): pd.UInt8Dtype(), + pa.uint16(): pd.UInt16Dtype(), + pa.uint32(): pd.UInt32Dtype(), + pa.uint64(): pd.UInt64Dtype(), + pa.bool_(): pd.BooleanDtype(), + pa.string(): pd.StringDtype(), + pa.float32(): pd.Float32Dtype(), + pa.float64(): pd.Float64Dtype(), + } + + +def arrow_string_types_mapper() -> Callable: + pa = import_optional_dependency("pyarrow") + + return { + pa.string(): pd.StringDtype(storage="pyarrow_numpy"), + pa.large_string(): pd.StringDtype(storage="pyarrow_numpy"), + }.get diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/api.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/api.py new file mode 100644 index 0000000000000000000000000000000000000000..4e8b34a61dfc62992a37d9fab3263ee00a28d1fc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/api.py @@ -0,0 +1,65 @@ +""" +Data IO api +""" + +from pandas.io.clipboards import read_clipboard +from pandas.io.excel import ( + ExcelFile, + ExcelWriter, + read_excel, +) +from pandas.io.feather_format import read_feather +from pandas.io.gbq import read_gbq +from pandas.io.html import read_html +from pandas.io.json import read_json +from pandas.io.orc import read_orc +from pandas.io.parquet import read_parquet +from pandas.io.parsers import ( + read_csv, + read_fwf, + read_table, +) +from pandas.io.pickle import ( + read_pickle, + to_pickle, +) +from pandas.io.pytables import ( + HDFStore, + read_hdf, +) +from pandas.io.sas import read_sas +from pandas.io.spss import read_spss +from pandas.io.sql import ( + read_sql, + read_sql_query, + read_sql_table, +) +from pandas.io.stata import read_stata +from pandas.io.xml import read_xml + +__all__ = [ + "ExcelFile", + "ExcelWriter", + "HDFStore", + "read_clipboard", + "read_csv", + "read_excel", + "read_feather", + "read_fwf", + "read_gbq", + "read_hdf", + "read_html", + "read_json", + "read_orc", + "read_parquet", + "read_pickle", + "read_sas", + "read_spss", + "read_sql", + "read_sql_query", + "read_sql_table", + "read_stata", + "read_table", + "read_xml", + "to_pickle", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/clipboards.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/clipboards.py new file mode 100644 index 0000000000000000000000000000000000000000..a15e37328e9fa95587d53b58b1af10e1e57fd60c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/clipboards.py @@ -0,0 +1,197 @@ +""" io on the clipboard """ +from __future__ import annotations + +from io import StringIO +from typing import TYPE_CHECKING +import warnings + +from pandas._libs import lib +from pandas.util._exceptions import find_stack_level +from pandas.util._validators import check_dtype_backend + +from pandas.core.dtypes.generic import ABCDataFrame + +from pandas import ( + get_option, + option_context, +) + +if TYPE_CHECKING: + from pandas._typing import DtypeBackend + + +def read_clipboard( + sep: str = r"\s+", + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + **kwargs, +): # pragma: no cover + r""" + Read text from clipboard and pass to :func:`~pandas.read_csv`. + + Parses clipboard contents similar to how CSV files are parsed + using :func:`~pandas.read_csv`. + + Parameters + ---------- + sep : str, default '\\s+' + A string or regex delimiter. The default of ``'\\s+'`` denotes + one or more whitespace characters. + + dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). Behaviour is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + (default). + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` + DataFrame. + + .. versionadded:: 2.0 + + **kwargs + See :func:`~pandas.read_csv` for the full argument list. + + Returns + ------- + DataFrame + A parsed :class:`~pandas.DataFrame` object. + + See Also + -------- + DataFrame.to_clipboard : Copy object to the system clipboard. + read_csv : Read a comma-separated values (csv) file into DataFrame. + read_fwf : Read a table of fixed-width formatted lines into DataFrame. + + Examples + -------- + >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C']) + >>> df.to_clipboard() # doctest: +SKIP + >>> pd.read_clipboard() # doctest: +SKIP + A B C + 0 1 2 3 + 1 4 5 6 + """ + encoding = kwargs.pop("encoding", "utf-8") + + # only utf-8 is valid for passed value because that's what clipboard + # supports + if encoding is not None and encoding.lower().replace("-", "") != "utf8": + raise NotImplementedError("reading from clipboard only supports utf-8 encoding") + + check_dtype_backend(dtype_backend) + + from pandas.io.clipboard import clipboard_get + from pandas.io.parsers import read_csv + + text = clipboard_get() + + # Try to decode (if needed, as "text" might already be a string here). + try: + text = text.decode(kwargs.get("encoding") or get_option("display.encoding")) + except AttributeError: + pass + + # Excel copies into clipboard with \t separation + # inspect no more then the 10 first lines, if they + # all contain an equal number (>0) of tabs, infer + # that this came from excel and set 'sep' accordingly + lines = text[:10000].split("\n")[:-1][:10] + + # Need to remove leading white space, since read_csv + # accepts: + # a b + # 0 1 2 + # 1 3 4 + + counts = {x.lstrip(" ").count("\t") for x in lines} + if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0: + sep = "\t" + # check the number of leading tabs in the first line + # to account for index columns + index_length = len(lines[0]) - len(lines[0].lstrip(" \t")) + if index_length != 0: + kwargs.setdefault("index_col", list(range(index_length))) + + # Edge case where sep is specified to be None, return to default + if sep is None and kwargs.get("delim_whitespace") is None: + sep = r"\s+" + + # Regex separator currently only works with python engine. + # Default to python if separator is multi-character (regex) + if len(sep) > 1 and kwargs.get("engine") is None: + kwargs["engine"] = "python" + elif len(sep) > 1 and kwargs.get("engine") == "c": + warnings.warn( + "read_clipboard with regex separator does not work properly with c engine.", + stacklevel=find_stack_level(), + ) + + return read_csv(StringIO(text), sep=sep, dtype_backend=dtype_backend, **kwargs) + + +def to_clipboard( + obj, excel: bool | None = True, sep: str | None = None, **kwargs +) -> None: # pragma: no cover + """ + Attempt to write text representation of object to the system clipboard + The clipboard can be then pasted into Excel for example. + + Parameters + ---------- + obj : the object to write to the clipboard + excel : bool, defaults to True + if True, use the provided separator, writing in a csv + format for allowing easy pasting into excel. + if False, write a string representation of the object + to the clipboard + sep : optional, defaults to tab + other keywords are passed to to_csv + + Notes + ----- + Requirements for your platform + - Linux: xclip, or xsel (with PyQt4 modules) + - Windows: + - OS X: + """ + encoding = kwargs.pop("encoding", "utf-8") + + # testing if an invalid encoding is passed to clipboard + if encoding is not None and encoding.lower().replace("-", "") != "utf8": + raise ValueError("clipboard only supports utf-8 encoding") + + from pandas.io.clipboard import clipboard_set + + if excel is None: + excel = True + + if excel: + try: + if sep is None: + sep = "\t" + buf = StringIO() + + # clipboard_set (pyperclip) expects unicode + obj.to_csv(buf, sep=sep, encoding="utf-8", **kwargs) + text = buf.getvalue() + + clipboard_set(text) + return + except TypeError: + warnings.warn( + "to_clipboard in excel mode requires a single character separator.", + stacklevel=find_stack_level(), + ) + elif sep is not None: + warnings.warn( + "to_clipboard with excel=False ignores the sep argument.", + stacklevel=find_stack_level(), + ) + + if isinstance(obj, ABCDataFrame): + # str(df) has various unhelpful defaults, like truncation + with option_context("display.max_colwidth", None): + objstr = obj.to_string(**kwargs) + else: + objstr = str(obj) + clipboard_set(objstr) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/common.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/common.py new file mode 100644 index 0000000000000000000000000000000000000000..72c9deeb54fc7aaab781b2870171cf983a47da1f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/common.py @@ -0,0 +1,1267 @@ +"""Common IO api utilities""" +from __future__ import annotations + +from abc import ( + ABC, + abstractmethod, +) +import codecs +from collections import defaultdict +from collections.abc import ( + Hashable, + Mapping, + Sequence, +) +import dataclasses +import functools +import gzip +from io import ( + BufferedIOBase, + BytesIO, + RawIOBase, + StringIO, + TextIOBase, + TextIOWrapper, +) +import mmap +import os +from pathlib import Path +import re +import tarfile +from typing import ( + IO, + TYPE_CHECKING, + Any, + AnyStr, + DefaultDict, + Generic, + Literal, + TypeVar, + cast, + overload, +) +from urllib.parse import ( + urljoin, + urlparse as parse_url, + uses_netloc, + uses_params, + uses_relative, +) +import warnings +import zipfile + +from pandas._typing import ( + BaseBuffer, + ReadCsvBuffer, +) +from pandas.compat import ( + get_bz2_file, + get_lzma_file, +) +from pandas.compat._optional import import_optional_dependency +from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import ( + is_bool, + is_file_like, + is_integer, + is_list_like, +) +from pandas.core.dtypes.generic import ABCMultiIndex + +from pandas.core.shared_docs import _shared_docs + +_VALID_URLS = set(uses_relative + uses_netloc + uses_params) +_VALID_URLS.discard("") +_RFC_3986_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*://") + +BaseBufferT = TypeVar("BaseBufferT", bound=BaseBuffer) + + +if TYPE_CHECKING: + from types import TracebackType + + from pandas._typing import ( + CompressionDict, + CompressionOptions, + FilePath, + ReadBuffer, + StorageOptions, + WriteBuffer, + ) + + from pandas import MultiIndex + + +@dataclasses.dataclass +class IOArgs: + """ + Return value of io/common.py:_get_filepath_or_buffer. + """ + + filepath_or_buffer: str | BaseBuffer + encoding: str + mode: str + compression: CompressionDict + should_close: bool = False + + +@dataclasses.dataclass +class IOHandles(Generic[AnyStr]): + """ + Return value of io/common.py:get_handle + + Can be used as a context manager. + + This is used to easily close created buffers and to handle corner cases when + TextIOWrapper is inserted. + + handle: The file handle to be used. + created_handles: All file handles that are created by get_handle + is_wrapped: Whether a TextIOWrapper needs to be detached. + """ + + # handle might not implement the IO-interface + handle: IO[AnyStr] + compression: CompressionDict + created_handles: list[IO[bytes] | IO[str]] = dataclasses.field(default_factory=list) + is_wrapped: bool = False + + def close(self) -> None: + """ + Close all created buffers. + + Note: If a TextIOWrapper was inserted, it is flushed and detached to + avoid closing the potentially user-created buffer. + """ + if self.is_wrapped: + assert isinstance(self.handle, TextIOWrapper) + self.handle.flush() + self.handle.detach() + self.created_handles.remove(self.handle) + for handle in self.created_handles: + handle.close() + self.created_handles = [] + self.is_wrapped = False + + def __enter__(self) -> IOHandles[AnyStr]: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + traceback: TracebackType | None, + ) -> None: + self.close() + + +def is_url(url: object) -> bool: + """ + Check to see if a URL has a valid protocol. + + Parameters + ---------- + url : str or unicode + + Returns + ------- + isurl : bool + If `url` has a valid protocol return True otherwise False. + """ + if not isinstance(url, str): + return False + return parse_url(url).scheme in _VALID_URLS + + +@overload +def _expand_user(filepath_or_buffer: str) -> str: + ... + + +@overload +def _expand_user(filepath_or_buffer: BaseBufferT) -> BaseBufferT: + ... + + +def _expand_user(filepath_or_buffer: str | BaseBufferT) -> str | BaseBufferT: + """ + Return the argument with an initial component of ~ or ~user + replaced by that user's home directory. + + Parameters + ---------- + filepath_or_buffer : object to be converted if possible + + Returns + ------- + expanded_filepath_or_buffer : an expanded filepath or the + input if not expandable + """ + if isinstance(filepath_or_buffer, str): + return os.path.expanduser(filepath_or_buffer) + return filepath_or_buffer + + +def validate_header_arg(header: object) -> None: + if header is None: + return + if is_integer(header): + header = cast(int, header) + if header < 0: + # GH 27779 + raise ValueError( + "Passing negative integer to header is invalid. " + "For no header, use header=None instead" + ) + return + if is_list_like(header, allow_sets=False): + header = cast(Sequence, header) + if not all(map(is_integer, header)): + raise ValueError("header must be integer or list of integers") + if any(i < 0 for i in header): + raise ValueError("cannot specify multi-index header with negative integers") + return + if is_bool(header): + raise TypeError( + "Passing a bool to header is invalid. Use header=None for no header or " + "header=int or list-like of ints to specify " + "the row(s) making up the column names" + ) + # GH 16338 + raise ValueError("header must be integer or list of integers") + + +@overload +def stringify_path(filepath_or_buffer: FilePath, convert_file_like: bool = ...) -> str: + ... + + +@overload +def stringify_path( + filepath_or_buffer: BaseBufferT, convert_file_like: bool = ... +) -> BaseBufferT: + ... + + +def stringify_path( + filepath_or_buffer: FilePath | BaseBufferT, + convert_file_like: bool = False, +) -> str | BaseBufferT: + """ + Attempt to convert a path-like object to a string. + + Parameters + ---------- + filepath_or_buffer : object to be converted + + Returns + ------- + str_filepath_or_buffer : maybe a string version of the object + + Notes + ----- + Objects supporting the fspath protocol are coerced + according to its __fspath__ method. + + Any other object is passed through unchanged, which includes bytes, + strings, buffers, or anything else that's not even path-like. + """ + if not convert_file_like and is_file_like(filepath_or_buffer): + # GH 38125: some fsspec objects implement os.PathLike but have already opened a + # file. This prevents opening the file a second time. infer_compression calls + # this function with convert_file_like=True to infer the compression. + return cast(BaseBufferT, filepath_or_buffer) + + if isinstance(filepath_or_buffer, os.PathLike): + filepath_or_buffer = filepath_or_buffer.__fspath__() + return _expand_user(filepath_or_buffer) + + +def urlopen(*args, **kwargs): + """ + Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of + the stdlib. + """ + import urllib.request + + return urllib.request.urlopen(*args, **kwargs) + + +def is_fsspec_url(url: FilePath | BaseBuffer) -> bool: + """ + Returns true if the given URL looks like + something fsspec can handle + """ + return ( + isinstance(url, str) + and bool(_RFC_3986_PATTERN.match(url)) + and not url.startswith(("http://", "https://")) + ) + + +@doc( + storage_options=_shared_docs["storage_options"], + compression_options=_shared_docs["compression_options"] % "filepath_or_buffer", +) +def _get_filepath_or_buffer( + filepath_or_buffer: FilePath | BaseBuffer, + encoding: str = "utf-8", + compression: CompressionOptions | None = None, + mode: str = "r", + storage_options: StorageOptions | None = None, +) -> IOArgs: + """ + If the filepath_or_buffer is a url, translate and return the buffer. + Otherwise passthrough. + + Parameters + ---------- + filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path), + or buffer + {compression_options} + + .. versionchanged:: 1.4.0 Zstandard support. + + encoding : the encoding to use to decode bytes, default is 'utf-8' + mode : str, optional + + {storage_options} + + + Returns the dataclass IOArgs. + """ + filepath_or_buffer = stringify_path(filepath_or_buffer) + + # handle compression dict + compression_method, compression = get_compression_method(compression) + compression_method = infer_compression(filepath_or_buffer, compression_method) + + # GH21227 internal compression is not used for non-binary handles. + if compression_method and hasattr(filepath_or_buffer, "write") and "b" not in mode: + warnings.warn( + "compression has no effect when passing a non-binary object as input.", + RuntimeWarning, + stacklevel=find_stack_level(), + ) + compression_method = None + + compression = dict(compression, method=compression_method) + + # bz2 and xz do not write the byte order mark for utf-16 and utf-32 + # print a warning when writing such files + if ( + "w" in mode + and compression_method in ["bz2", "xz"] + and encoding in ["utf-16", "utf-32"] + ): + warnings.warn( + f"{compression} will not write the byte order mark for {encoding}", + UnicodeWarning, + stacklevel=find_stack_level(), + ) + + # Use binary mode when converting path-like objects to file-like objects (fsspec) + # except when text mode is explicitly requested. The original mode is returned if + # fsspec is not used. + fsspec_mode = mode + if "t" not in fsspec_mode and "b" not in fsspec_mode: + fsspec_mode += "b" + + if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer): + # TODO: fsspec can also handle HTTP via requests, but leaving this + # unchanged. using fsspec appears to break the ability to infer if the + # server responded with gzipped data + storage_options = storage_options or {} + + # waiting until now for importing to match intended lazy logic of + # urlopen function defined elsewhere in this module + import urllib.request + + # assuming storage_options is to be interpreted as headers + req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options) + with urlopen(req_info) as req: + content_encoding = req.headers.get("Content-Encoding", None) + if content_encoding == "gzip": + # Override compression based on Content-Encoding header + compression = {"method": "gzip"} + reader = BytesIO(req.read()) + return IOArgs( + filepath_or_buffer=reader, + encoding=encoding, + compression=compression, + should_close=True, + mode=fsspec_mode, + ) + + if is_fsspec_url(filepath_or_buffer): + assert isinstance( + filepath_or_buffer, str + ) # just to appease mypy for this branch + # two special-case s3-like protocols; these have special meaning in Hadoop, + # but are equivalent to just "s3" from fsspec's point of view + # cc #11071 + if filepath_or_buffer.startswith("s3a://"): + filepath_or_buffer = filepath_or_buffer.replace("s3a://", "s3://") + if filepath_or_buffer.startswith("s3n://"): + filepath_or_buffer = filepath_or_buffer.replace("s3n://", "s3://") + fsspec = import_optional_dependency("fsspec") + + # If botocore is installed we fallback to reading with anon=True + # to allow reads from public buckets + err_types_to_retry_with_anon: list[Any] = [] + try: + import_optional_dependency("botocore") + from botocore.exceptions import ( + ClientError, + NoCredentialsError, + ) + + err_types_to_retry_with_anon = [ + ClientError, + NoCredentialsError, + PermissionError, + ] + except ImportError: + pass + + try: + file_obj = fsspec.open( + filepath_or_buffer, mode=fsspec_mode, **(storage_options or {}) + ).open() + # GH 34626 Reads from Public Buckets without Credentials needs anon=True + except tuple(err_types_to_retry_with_anon): + if storage_options is None: + storage_options = {"anon": True} + else: + # don't mutate user input. + storage_options = dict(storage_options) + storage_options["anon"] = True + file_obj = fsspec.open( + filepath_or_buffer, mode=fsspec_mode, **(storage_options or {}) + ).open() + + return IOArgs( + filepath_or_buffer=file_obj, + encoding=encoding, + compression=compression, + should_close=True, + mode=fsspec_mode, + ) + elif storage_options: + raise ValueError( + "storage_options passed with file object or non-fsspec file path" + ) + + if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): + return IOArgs( + filepath_or_buffer=_expand_user(filepath_or_buffer), + encoding=encoding, + compression=compression, + should_close=False, + mode=mode, + ) + + # is_file_like requires (read | write) & __iter__ but __iter__ is only + # needed for read_csv(engine=python) + if not ( + hasattr(filepath_or_buffer, "read") or hasattr(filepath_or_buffer, "write") + ): + msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}" + raise ValueError(msg) + + return IOArgs( + filepath_or_buffer=filepath_or_buffer, + encoding=encoding, + compression=compression, + should_close=False, + mode=mode, + ) + + +def file_path_to_url(path: str) -> str: + """ + converts an absolute native path to a FILE URL. + + Parameters + ---------- + path : a path in native format + + Returns + ------- + a valid FILE URL + """ + # lazify expensive import (~30ms) + from urllib.request import pathname2url + + return urljoin("file:", pathname2url(path)) + + +extension_to_compression = { + ".tar": "tar", + ".tar.gz": "tar", + ".tar.bz2": "tar", + ".tar.xz": "tar", + ".gz": "gzip", + ".bz2": "bz2", + ".zip": "zip", + ".xz": "xz", + ".zst": "zstd", +} +_supported_compressions = set(extension_to_compression.values()) + + +def get_compression_method( + compression: CompressionOptions, +) -> tuple[str | None, CompressionDict]: + """ + Simplifies a compression argument to a compression method string and + a mapping containing additional arguments. + + Parameters + ---------- + compression : str or mapping + If string, specifies the compression method. If mapping, value at key + 'method' specifies compression method. + + Returns + ------- + tuple of ({compression method}, Optional[str] + {compression arguments}, Dict[str, Any]) + + Raises + ------ + ValueError on mapping missing 'method' key + """ + compression_method: str | None + if isinstance(compression, Mapping): + compression_args = dict(compression) + try: + compression_method = compression_args.pop("method") + except KeyError as err: + raise ValueError("If mapping, compression must have key 'method'") from err + else: + compression_args = {} + compression_method = compression + return compression_method, compression_args + + +@doc(compression_options=_shared_docs["compression_options"] % "filepath_or_buffer") +def infer_compression( + filepath_or_buffer: FilePath | BaseBuffer, compression: str | None +) -> str | None: + """ + Get the compression method for filepath_or_buffer. If compression='infer', + the inferred compression method is returned. Otherwise, the input + compression method is returned unchanged, unless it's invalid, in which + case an error is raised. + + Parameters + ---------- + filepath_or_buffer : str or file handle + File path or object. + {compression_options} + + .. versionchanged:: 1.4.0 Zstandard support. + + Returns + ------- + string or None + + Raises + ------ + ValueError on invalid compression specified. + """ + if compression is None: + return None + + # Infer compression + if compression == "infer": + # Convert all path types (e.g. pathlib.Path) to strings + filepath_or_buffer = stringify_path(filepath_or_buffer, convert_file_like=True) + if not isinstance(filepath_or_buffer, str): + # Cannot infer compression of a buffer, assume no compression + return None + + # Infer compression from the filename/URL extension + for extension, compression in extension_to_compression.items(): + if filepath_or_buffer.lower().endswith(extension): + return compression + return None + + # Compression has been specified. Check that it's valid + if compression in _supported_compressions: + return compression + + valid = ["infer", None] + sorted(_supported_compressions) + msg = ( + f"Unrecognized compression type: {compression}\n" + f"Valid compression types are {valid}" + ) + raise ValueError(msg) + + +def check_parent_directory(path: Path | str) -> None: + """ + Check if parent directory of a file exists, raise OSError if it does not + + Parameters + ---------- + path: Path or str + Path to check parent directory of + """ + parent = Path(path).parent + if not parent.is_dir(): + raise OSError(rf"Cannot save file into a non-existent directory: '{parent}'") + + +@overload +def get_handle( + path_or_buf: FilePath | BaseBuffer, + mode: str, + *, + encoding: str | None = ..., + compression: CompressionOptions = ..., + memory_map: bool = ..., + is_text: Literal[False], + errors: str | None = ..., + storage_options: StorageOptions = ..., +) -> IOHandles[bytes]: + ... + + +@overload +def get_handle( + path_or_buf: FilePath | BaseBuffer, + mode: str, + *, + encoding: str | None = ..., + compression: CompressionOptions = ..., + memory_map: bool = ..., + is_text: Literal[True] = ..., + errors: str | None = ..., + storage_options: StorageOptions = ..., +) -> IOHandles[str]: + ... + + +@overload +def get_handle( + path_or_buf: FilePath | BaseBuffer, + mode: str, + *, + encoding: str | None = ..., + compression: CompressionOptions = ..., + memory_map: bool = ..., + is_text: bool = ..., + errors: str | None = ..., + storage_options: StorageOptions = ..., +) -> IOHandles[str] | IOHandles[bytes]: + ... + + +@doc(compression_options=_shared_docs["compression_options"] % "path_or_buf") +def get_handle( + path_or_buf: FilePath | BaseBuffer, + mode: str, + *, + encoding: str | None = None, + compression: CompressionOptions | None = None, + memory_map: bool = False, + is_text: bool = True, + errors: str | None = None, + storage_options: StorageOptions | None = None, +) -> IOHandles[str] | IOHandles[bytes]: + """ + Get file handle for given path/buffer and mode. + + Parameters + ---------- + path_or_buf : str or file handle + File path or object. + mode : str + Mode to open path_or_buf with. + encoding : str or None + Encoding to use. + {compression_options} + + May be a dict with key 'method' as compression mode + and other keys as compression options if compression + mode is 'zip'. + + Passing compression options as keys in dict is + supported for compression modes 'gzip', 'bz2', 'zstd' and 'zip'. + + .. versionchanged:: 1.4.0 Zstandard support. + + memory_map : bool, default False + See parsers._parser_params for more information. Only used by read_csv. + is_text : bool, default True + Whether the type of the content passed to the file/buffer is string or + bytes. This is not the same as `"b" not in mode`. If a string content is + passed to a binary file/buffer, a wrapper is inserted. + errors : str, default 'strict' + Specifies how encoding and decoding errors are to be handled. + See the errors argument for :func:`open` for a full list + of options. + storage_options: StorageOptions = None + Passed to _get_filepath_or_buffer + + Returns the dataclass IOHandles + """ + # Windows does not default to utf-8. Set to utf-8 for a consistent behavior + encoding = encoding or "utf-8" + + errors = errors or "strict" + + # read_csv does not know whether the buffer is opened in binary/text mode + if _is_binary_mode(path_or_buf, mode) and "b" not in mode: + mode += "b" + + # validate encoding and errors + codecs.lookup(encoding) + if isinstance(errors, str): + codecs.lookup_error(errors) + + # open URLs + ioargs = _get_filepath_or_buffer( + path_or_buf, + encoding=encoding, + compression=compression, + mode=mode, + storage_options=storage_options, + ) + + handle = ioargs.filepath_or_buffer + handles: list[BaseBuffer] + + # memory mapping needs to be the first step + # only used for read_csv + handle, memory_map, handles = _maybe_memory_map(handle, memory_map) + + is_path = isinstance(handle, str) + compression_args = dict(ioargs.compression) + compression = compression_args.pop("method") + + # Only for write methods + if "r" not in mode and is_path: + check_parent_directory(str(handle)) + + if compression: + if compression != "zstd": + # compression libraries do not like an explicit text-mode + ioargs.mode = ioargs.mode.replace("t", "") + elif compression == "zstd" and "b" not in ioargs.mode: + # python-zstandard defaults to text mode, but we always expect + # compression libraries to use binary mode. + ioargs.mode += "b" + + # GZ Compression + if compression == "gzip": + if isinstance(handle, str): + # error: Incompatible types in assignment (expression has type + # "GzipFile", variable has type "Union[str, BaseBuffer]") + handle = gzip.GzipFile( # type: ignore[assignment] + filename=handle, + mode=ioargs.mode, + **compression_args, + ) + else: + handle = gzip.GzipFile( + # No overload variant of "GzipFile" matches argument types + # "Union[str, BaseBuffer]", "str", "Dict[str, Any]" + fileobj=handle, # type: ignore[call-overload] + mode=ioargs.mode, + **compression_args, + ) + + # BZ Compression + elif compression == "bz2": + # Overload of "BZ2File" to handle pickle protocol 5 + # "Union[str, BaseBuffer]", "str", "Dict[str, Any]" + handle = get_bz2_file()( # type: ignore[call-overload] + handle, + mode=ioargs.mode, + **compression_args, + ) + + # ZIP Compression + elif compression == "zip": + # error: Argument 1 to "_BytesZipFile" has incompatible type + # "Union[str, BaseBuffer]"; expected "Union[Union[str, PathLike[str]], + # ReadBuffer[bytes], WriteBuffer[bytes]]" + handle = _BytesZipFile( + handle, ioargs.mode, **compression_args # type: ignore[arg-type] + ) + if handle.buffer.mode == "r": + handles.append(handle) + zip_names = handle.buffer.namelist() + if len(zip_names) == 1: + handle = handle.buffer.open(zip_names.pop()) + elif not zip_names: + raise ValueError(f"Zero files found in ZIP file {path_or_buf}") + else: + raise ValueError( + "Multiple files found in ZIP file. " + f"Only one file per ZIP: {zip_names}" + ) + + # TAR Encoding + elif compression == "tar": + compression_args.setdefault("mode", ioargs.mode) + if isinstance(handle, str): + handle = _BytesTarFile(name=handle, **compression_args) + else: + # error: Argument "fileobj" to "_BytesTarFile" has incompatible + # type "BaseBuffer"; expected "Union[ReadBuffer[bytes], + # WriteBuffer[bytes], None]" + handle = _BytesTarFile( + fileobj=handle, **compression_args # type: ignore[arg-type] + ) + assert isinstance(handle, _BytesTarFile) + if "r" in handle.buffer.mode: + handles.append(handle) + files = handle.buffer.getnames() + if len(files) == 1: + file = handle.buffer.extractfile(files[0]) + assert file is not None + handle = file + elif not files: + raise ValueError(f"Zero files found in TAR archive {path_or_buf}") + else: + raise ValueError( + "Multiple files found in TAR archive. " + f"Only one file per TAR archive: {files}" + ) + + # XZ Compression + elif compression == "xz": + # error: Argument 1 to "LZMAFile" has incompatible type "Union[str, + # BaseBuffer]"; expected "Optional[Union[Union[str, bytes, PathLike[str], + # PathLike[bytes]], IO[bytes]], None]" + handle = get_lzma_file()( + handle, ioargs.mode, **compression_args # type: ignore[arg-type] + ) + + # Zstd Compression + elif compression == "zstd": + zstd = import_optional_dependency("zstandard") + if "r" in ioargs.mode: + open_args = {"dctx": zstd.ZstdDecompressor(**compression_args)} + else: + open_args = {"cctx": zstd.ZstdCompressor(**compression_args)} + handle = zstd.open( + handle, + mode=ioargs.mode, + **open_args, + ) + + # Unrecognized Compression + else: + msg = f"Unrecognized compression type: {compression}" + raise ValueError(msg) + + assert not isinstance(handle, str) + handles.append(handle) + + elif isinstance(handle, str): + # Check whether the filename is to be opened in binary mode. + # Binary mode does not support 'encoding' and 'newline'. + if ioargs.encoding and "b" not in ioargs.mode: + # Encoding + handle = open( + handle, + ioargs.mode, + encoding=ioargs.encoding, + errors=errors, + newline="", + ) + else: + # Binary mode + handle = open(handle, ioargs.mode) + handles.append(handle) + + # Convert BytesIO or file objects passed with an encoding + is_wrapped = False + if not is_text and ioargs.mode == "rb" and isinstance(handle, TextIOBase): + # not added to handles as it does not open/buffer resources + handle = _BytesIOWrapper( + handle, + encoding=ioargs.encoding, + ) + elif is_text and ( + compression or memory_map or _is_binary_mode(handle, ioargs.mode) + ): + if ( + not hasattr(handle, "readable") + or not hasattr(handle, "writable") + or not hasattr(handle, "seekable") + ): + handle = _IOWrapper(handle) + # error: Argument 1 to "TextIOWrapper" has incompatible type + # "_IOWrapper"; expected "IO[bytes]" + handle = TextIOWrapper( + handle, # type: ignore[arg-type] + encoding=ioargs.encoding, + errors=errors, + newline="", + ) + handles.append(handle) + # only marked as wrapped when the caller provided a handle + is_wrapped = not ( + isinstance(ioargs.filepath_or_buffer, str) or ioargs.should_close + ) + + if "r" in ioargs.mode and not hasattr(handle, "read"): + raise TypeError( + "Expected file path name or file-like object, " + f"got {type(ioargs.filepath_or_buffer)} type" + ) + + handles.reverse() # close the most recently added buffer first + if ioargs.should_close: + assert not isinstance(ioargs.filepath_or_buffer, str) + handles.append(ioargs.filepath_or_buffer) + + return IOHandles( + # error: Argument "handle" to "IOHandles" has incompatible type + # "Union[TextIOWrapper, GzipFile, BaseBuffer, typing.IO[bytes], + # typing.IO[Any]]"; expected "pandas._typing.IO[Any]" + handle=handle, # type: ignore[arg-type] + # error: Argument "created_handles" to "IOHandles" has incompatible type + # "List[BaseBuffer]"; expected "List[Union[IO[bytes], IO[str]]]" + created_handles=handles, # type: ignore[arg-type] + is_wrapped=is_wrapped, + compression=ioargs.compression, + ) + + +# error: Definition of "__enter__" in base class "IOBase" is incompatible +# with definition in base class "BinaryIO" +class _BufferedWriter(BytesIO, ABC): # type: ignore[misc] + """ + Some objects do not support multiple .write() calls (TarFile and ZipFile). + This wrapper writes to the underlying buffer on close. + """ + + buffer = BytesIO() + + @abstractmethod + def write_to_buffer(self) -> None: + ... + + def close(self) -> None: + if self.closed: + # already closed + return + if self.getbuffer().nbytes: + # write to buffer + self.seek(0) + with self.buffer: + self.write_to_buffer() + else: + self.buffer.close() + super().close() + + +class _BytesTarFile(_BufferedWriter): + def __init__( + self, + name: str | None = None, + mode: Literal["r", "a", "w", "x"] = "r", + fileobj: ReadBuffer[bytes] | WriteBuffer[bytes] | None = None, + archive_name: str | None = None, + **kwargs, + ) -> None: + super().__init__() + self.archive_name = archive_name + self.name = name + # error: Incompatible types in assignment (expression has type "TarFile", + # base class "_BufferedWriter" defined the type as "BytesIO") + self.buffer: tarfile.TarFile = tarfile.TarFile.open( # type: ignore[assignment] + name=name, + mode=self.extend_mode(mode), + fileobj=fileobj, + **kwargs, + ) + + def extend_mode(self, mode: str) -> str: + mode = mode.replace("b", "") + if mode != "w": + return mode + if self.name is not None: + suffix = Path(self.name).suffix + if suffix in (".gz", ".xz", ".bz2"): + mode = f"{mode}:{suffix[1:]}" + return mode + + def infer_filename(self) -> str | None: + """ + If an explicit archive_name is not given, we still want the file inside the zip + file not to be named something.tar, because that causes confusion (GH39465). + """ + if self.name is None: + return None + + filename = Path(self.name) + if filename.suffix == ".tar": + return filename.with_suffix("").name + elif filename.suffix in (".tar.gz", ".tar.bz2", ".tar.xz"): + return filename.with_suffix("").with_suffix("").name + return filename.name + + def write_to_buffer(self) -> None: + # TarFile needs a non-empty string + archive_name = self.archive_name or self.infer_filename() or "tar" + tarinfo = tarfile.TarInfo(name=archive_name) + tarinfo.size = len(self.getvalue()) + self.buffer.addfile(tarinfo, self) + + +class _BytesZipFile(_BufferedWriter): + def __init__( + self, + file: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], + mode: str, + archive_name: str | None = None, + **kwargs, + ) -> None: + super().__init__() + mode = mode.replace("b", "") + self.archive_name = archive_name + + kwargs.setdefault("compression", zipfile.ZIP_DEFLATED) + # error: Incompatible types in assignment (expression has type "ZipFile", + # base class "_BufferedWriter" defined the type as "BytesIO") + self.buffer: zipfile.ZipFile = zipfile.ZipFile( # type: ignore[assignment] + file, mode, **kwargs + ) + + def infer_filename(self) -> str | None: + """ + If an explicit archive_name is not given, we still want the file inside the zip + file not to be named something.zip, because that causes confusion (GH39465). + """ + if isinstance(self.buffer.filename, (os.PathLike, str)): + filename = Path(self.buffer.filename) + if filename.suffix == ".zip": + return filename.with_suffix("").name + return filename.name + return None + + def write_to_buffer(self) -> None: + # ZipFile needs a non-empty string + archive_name = self.archive_name or self.infer_filename() or "zip" + self.buffer.writestr(archive_name, self.getvalue()) + + +class _IOWrapper: + # TextIOWrapper is overly strict: it request that the buffer has seekable, readable, + # and writable. If we have a read-only buffer, we shouldn't need writable and vice + # versa. Some buffers, are seek/read/writ-able but they do not have the "-able" + # methods, e.g., tempfile.SpooledTemporaryFile. + # If a buffer does not have the above "-able" methods, we simple assume they are + # seek/read/writ-able. + def __init__(self, buffer: BaseBuffer) -> None: + self.buffer = buffer + + def __getattr__(self, name: str): + return getattr(self.buffer, name) + + def readable(self) -> bool: + if hasattr(self.buffer, "readable"): + return self.buffer.readable() + return True + + def seekable(self) -> bool: + if hasattr(self.buffer, "seekable"): + return self.buffer.seekable() + return True + + def writable(self) -> bool: + if hasattr(self.buffer, "writable"): + return self.buffer.writable() + return True + + +class _BytesIOWrapper: + # Wrapper that wraps a StringIO buffer and reads bytes from it + # Created for compat with pyarrow read_csv + def __init__(self, buffer: StringIO | TextIOBase, encoding: str = "utf-8") -> None: + self.buffer = buffer + self.encoding = encoding + # Because a character can be represented by more than 1 byte, + # it is possible that reading will produce more bytes than n + # We store the extra bytes in this overflow variable, and append the + # overflow to the front of the bytestring the next time reading is performed + self.overflow = b"" + + def __getattr__(self, attr: str): + return getattr(self.buffer, attr) + + def read(self, n: int | None = -1) -> bytes: + assert self.buffer is not None + bytestring = self.buffer.read(n).encode(self.encoding) + # When n=-1/n greater than remaining bytes: Read entire file/rest of file + combined_bytestring = self.overflow + bytestring + if n is None or n < 0 or n >= len(combined_bytestring): + self.overflow = b"" + return combined_bytestring + else: + to_return = combined_bytestring[:n] + self.overflow = combined_bytestring[n:] + return to_return + + +def _maybe_memory_map( + handle: str | BaseBuffer, memory_map: bool +) -> tuple[str | BaseBuffer, bool, list[BaseBuffer]]: + """Try to memory map file/buffer.""" + handles: list[BaseBuffer] = [] + memory_map &= hasattr(handle, "fileno") or isinstance(handle, str) + if not memory_map: + return handle, memory_map, handles + + # mmap used by only read_csv + handle = cast(ReadCsvBuffer, handle) + + # need to open the file first + if isinstance(handle, str): + handle = open(handle, "rb") + handles.append(handle) + + try: + # open mmap and adds *-able + # error: Argument 1 to "_IOWrapper" has incompatible type "mmap"; + # expected "BaseBuffer" + wrapped = _IOWrapper( + mmap.mmap( + handle.fileno(), 0, access=mmap.ACCESS_READ # type: ignore[arg-type] + ) + ) + finally: + for handle in reversed(handles): + # error: "BaseBuffer" has no attribute "close" + handle.close() # type: ignore[attr-defined] + + return wrapped, memory_map, [wrapped] + + +def file_exists(filepath_or_buffer: FilePath | BaseBuffer) -> bool: + """Test whether file exists.""" + exists = False + filepath_or_buffer = stringify_path(filepath_or_buffer) + if not isinstance(filepath_or_buffer, str): + return exists + try: + exists = os.path.exists(filepath_or_buffer) + # gh-5874: if the filepath is too long will raise here + except (TypeError, ValueError): + pass + return exists + + +def _is_binary_mode(handle: FilePath | BaseBuffer, mode: str) -> bool: + """Whether the handle is opened in binary mode""" + # specified by user + if "t" in mode or "b" in mode: + return "b" in mode + + # exceptions + text_classes = ( + # classes that expect string but have 'b' in mode + codecs.StreamWriter, + codecs.StreamReader, + codecs.StreamReaderWriter, + ) + if issubclass(type(handle), text_classes): + return False + + return isinstance(handle, _get_binary_io_classes()) or "b" in getattr( + handle, "mode", mode + ) + + +@functools.lru_cache +def _get_binary_io_classes() -> tuple[type, ...]: + """IO classes that that expect bytes""" + binary_classes: tuple[type, ...] = (BufferedIOBase, RawIOBase) + + # python-zstandard doesn't use any of the builtin base classes; instead we + # have to use the `zstd.ZstdDecompressionReader` class for isinstance checks. + # Unfortunately `zstd.ZstdDecompressionReader` isn't exposed by python-zstandard + # so we have to get it from a `zstd.ZstdDecompressor` instance. + # See also https://github.com/indygreg/python-zstandard/pull/165. + zstd = import_optional_dependency("zstandard", errors="ignore") + if zstd is not None: + with zstd.ZstdDecompressor().stream_reader(b"") as reader: + binary_classes += (type(reader),) + + return binary_classes + + +def is_potential_multi_index( + columns: Sequence[Hashable] | MultiIndex, + index_col: bool | Sequence[int] | None = None, +) -> bool: + """ + Check whether or not the `columns` parameter + could be converted into a MultiIndex. + + Parameters + ---------- + columns : array-like + Object which may or may not be convertible into a MultiIndex + index_col : None, bool or list, optional + Column or columns to use as the (possibly hierarchical) index + + Returns + ------- + bool : Whether or not columns could become a MultiIndex + """ + if index_col is None or isinstance(index_col, bool): + index_col = [] + + return bool( + len(columns) + and not isinstance(columns, ABCMultiIndex) + and all(isinstance(c, tuple) for c in columns if c not in list(index_col)) + ) + + +def dedup_names( + names: Sequence[Hashable], is_potential_multiindex: bool +) -> Sequence[Hashable]: + """ + Rename column names if duplicates exist. + + Currently the renaming is done by appending a period and an autonumeric, + but a custom pattern may be supported in the future. + + Examples + -------- + >>> dedup_names(["x", "y", "x", "x"], is_potential_multiindex=False) + ['x', 'y', 'x.1', 'x.2'] + """ + names = list(names) # so we can index + counts: DefaultDict[Hashable, int] = defaultdict(int) + + for i, col in enumerate(names): + cur_count = counts[col] + + while cur_count > 0: + counts[col] = cur_count + 1 + + if is_potential_multiindex: + # for mypy + assert isinstance(col, tuple) + col = col[:-1] + (f"{col[-1]}.{cur_count}",) + else: + col = f"{col}.{cur_count}" + cur_count = counts[col] + + names[i] = col + counts[col] = cur_count + 1 + + return names diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/orc.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/orc.py new file mode 100644 index 0000000000000000000000000000000000000000..fed9463c38d5deb907cb7df0adee03152380d7a0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/orc.py @@ -0,0 +1,245 @@ +""" orc compat """ +from __future__ import annotations + +import io +from types import ModuleType +from typing import ( + TYPE_CHECKING, + Any, + Literal, +) + +from pandas._config import using_pyarrow_string_dtype + +from pandas._libs import lib +from pandas.compat._optional import import_optional_dependency +from pandas.util._validators import check_dtype_backend + +import pandas as pd +from pandas.core.indexes.api import default_index + +from pandas.io._util import arrow_string_types_mapper +from pandas.io.common import ( + get_handle, + is_fsspec_url, +) + +if TYPE_CHECKING: + import fsspec + import pyarrow.fs + + from pandas._typing import ( + DtypeBackend, + FilePath, + ReadBuffer, + WriteBuffer, + ) + + from pandas.core.frame import DataFrame + + +def read_orc( + path: FilePath | ReadBuffer[bytes], + columns: list[str] | None = None, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + filesystem: pyarrow.fs.FileSystem | fsspec.spec.AbstractFileSystem | None = None, + **kwargs: Any, +) -> DataFrame: + """ + Load an ORC object from the file path, returning a DataFrame. + + Parameters + ---------- + path : str, path object, or file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``read()`` function. The string could be a URL. + Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: + ``file://localhost/path/to/table.orc``. + columns : list, default None + If not None, only these columns will be read from the file. + Output always follows the ordering of the file and not the columns list. + This mirrors the original behaviour of + :external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`. + dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). Behaviour is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + (default). + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` + DataFrame. + + .. versionadded:: 2.0 + + filesystem : fsspec or pyarrow filesystem, default None + Filesystem object to use when reading the parquet file. + + .. versionadded:: 2.1.0 + + **kwargs + Any additional kwargs are passed to pyarrow. + + Returns + ------- + DataFrame + + Notes + ----- + Before using this function you should read the :ref:`user guide about ORC ` + and :ref:`install optional dependencies `. + + If ``path`` is a URI scheme pointing to a local or remote file (e.g. "s3://"), + a ``pyarrow.fs`` filesystem will be attempted to read the file. You can also pass a + pyarrow or fsspec filesystem object into the filesystem keyword to override this + behavior. + + Examples + -------- + >>> result = pd.read_orc("example_pa.orc") # doctest: +SKIP + """ + # we require a newer version of pyarrow than we support for parquet + + orc = import_optional_dependency("pyarrow.orc") + + check_dtype_backend(dtype_backend) + + with get_handle(path, "rb", is_text=False) as handles: + source = handles.handle + if is_fsspec_url(path) and filesystem is None: + pa = import_optional_dependency("pyarrow") + pa_fs = import_optional_dependency("pyarrow.fs") + try: + filesystem, source = pa_fs.FileSystem.from_uri(path) + except (TypeError, pa.ArrowInvalid): + pass + + pa_table = orc.read_table( + source=source, columns=columns, filesystem=filesystem, **kwargs + ) + if dtype_backend is not lib.no_default: + if dtype_backend == "pyarrow": + df = pa_table.to_pandas(types_mapper=pd.ArrowDtype) + else: + from pandas.io._util import _arrow_dtype_mapping + + mapping = _arrow_dtype_mapping() + df = pa_table.to_pandas(types_mapper=mapping.get) + return df + else: + if using_pyarrow_string_dtype(): + types_mapper = arrow_string_types_mapper() + else: + types_mapper = None + return pa_table.to_pandas(types_mapper=types_mapper) + + +def to_orc( + df: DataFrame, + path: FilePath | WriteBuffer[bytes] | None = None, + *, + engine: Literal["pyarrow"] = "pyarrow", + index: bool | None = None, + engine_kwargs: dict[str, Any] | None = None, +) -> bytes | None: + """ + Write a DataFrame to the ORC format. + + .. versionadded:: 1.5.0 + + Parameters + ---------- + df : DataFrame + The dataframe to be written to ORC. Raises NotImplementedError + if dtype of one or more columns is category, unsigned integers, + intervals, periods or sparse. + path : str, file-like object or None, default None + If a string, it will be used as Root Directory path + when writing a partitioned dataset. By file-like object, + we refer to objects with a write() method, such as a file handle + (e.g. via builtin open function). If path is None, + a bytes object is returned. + engine : str, default 'pyarrow' + ORC library to use. + index : bool, optional + If ``True``, include the dataframe's index(es) in the file output. If + ``False``, they will not be written to the file. + If ``None``, similar to ``infer`` the dataframe's index(es) + will be saved. However, instead of being saved as values, + the RangeIndex will be stored as a range in the metadata so it + doesn't require much space and is faster. Other indexes will + be included as columns in the file output. + engine_kwargs : dict[str, Any] or None, default None + Additional keyword arguments passed to :func:`pyarrow.orc.write_table`. + + Returns + ------- + bytes if no path argument is provided else None + + Raises + ------ + NotImplementedError + Dtype of one or more columns is category, unsigned integers, interval, + period or sparse. + ValueError + engine is not pyarrow. + + Notes + ----- + * Before using this function you should read the + :ref:`user guide about ORC ` and + :ref:`install optional dependencies `. + * This function requires `pyarrow `_ + library. + * For supported dtypes please refer to `supported ORC features in Arrow + `__. + * Currently timezones in datetime columns are not preserved when a + dataframe is converted into ORC files. + """ + if index is None: + index = df.index.names[0] is not None + if engine_kwargs is None: + engine_kwargs = {} + + # validate index + # -------------- + + # validate that we have only a default index + # raise on anything else as we don't serialize the index + + if not df.index.equals(default_index(len(df))): + raise ValueError( + "orc does not support serializing a non-default index for the index; " + "you can .reset_index() to make the index into column(s)" + ) + + if df.index.name is not None: + raise ValueError("orc does not serialize index meta-data on a default index") + + if engine != "pyarrow": + raise ValueError("engine must be 'pyarrow'") + engine = import_optional_dependency(engine, min_version="10.0.1") + pa = import_optional_dependency("pyarrow") + orc = import_optional_dependency("pyarrow.orc") + + was_none = path is None + if was_none: + path = io.BytesIO() + assert path is not None # For mypy + with get_handle(path, "wb", is_text=False) as handles: + assert isinstance(engine, ModuleType) # For mypy + try: + orc.write_table( + engine.Table.from_pandas(df, preserve_index=index), + handles.handle, + **engine_kwargs, + ) + except (TypeError, pa.ArrowNotImplementedError) as e: + raise NotImplementedError( + "The dtype of one or more columns is not supported yet." + ) from e + + if was_none: + assert isinstance(path, io.BytesIO) # For mypy + return path.getvalue() + return None diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parquet.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parquet.py new file mode 100644 index 0000000000000000000000000000000000000000..9570d6f8b26bd85585e5a46145b7871f1bb6eb3a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parquet.py @@ -0,0 +1,676 @@ +""" parquet compat """ +from __future__ import annotations + +import io +import json +import os +from typing import ( + TYPE_CHECKING, + Any, + Literal, +) +import warnings +from warnings import catch_warnings + +from pandas._config import using_pyarrow_string_dtype +from pandas._config.config import _get_option + +from pandas._libs import lib +from pandas.compat._optional import import_optional_dependency +from pandas.errors import AbstractMethodError +from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level +from pandas.util._validators import check_dtype_backend + +import pandas as pd +from pandas import ( + DataFrame, + get_option, +) +from pandas.core.shared_docs import _shared_docs + +from pandas.io._util import arrow_string_types_mapper +from pandas.io.common import ( + IOHandles, + get_handle, + is_fsspec_url, + is_url, + stringify_path, +) + +if TYPE_CHECKING: + from pandas._typing import ( + DtypeBackend, + FilePath, + ReadBuffer, + StorageOptions, + WriteBuffer, + ) + + +def get_engine(engine: str) -> BaseImpl: + """return our implementation""" + if engine == "auto": + engine = get_option("io.parquet.engine") + + if engine == "auto": + # try engines in this order + engine_classes = [PyArrowImpl, FastParquetImpl] + + error_msgs = "" + for engine_class in engine_classes: + try: + return engine_class() + except ImportError as err: + error_msgs += "\n - " + str(err) + + raise ImportError( + "Unable to find a usable engine; " + "tried using: 'pyarrow', 'fastparquet'.\n" + "A suitable version of " + "pyarrow or fastparquet is required for parquet " + "support.\n" + "Trying to import the above resulted in these errors:" + f"{error_msgs}" + ) + + if engine == "pyarrow": + return PyArrowImpl() + elif engine == "fastparquet": + return FastParquetImpl() + + raise ValueError("engine must be one of 'pyarrow', 'fastparquet'") + + +def _get_path_or_handle( + path: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], + fs: Any, + storage_options: StorageOptions | None = None, + mode: str = "rb", + is_dir: bool = False, +) -> tuple[ + FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], IOHandles[bytes] | None, Any +]: + """File handling for PyArrow.""" + path_or_handle = stringify_path(path) + if fs is not None: + pa_fs = import_optional_dependency("pyarrow.fs", errors="ignore") + fsspec = import_optional_dependency("fsspec", errors="ignore") + if pa_fs is not None and isinstance(fs, pa_fs.FileSystem): + if storage_options: + raise NotImplementedError( + "storage_options not supported with a pyarrow FileSystem." + ) + elif fsspec is not None and isinstance(fs, fsspec.spec.AbstractFileSystem): + pass + else: + raise ValueError( + f"filesystem must be a pyarrow or fsspec FileSystem, " + f"not a {type(fs).__name__}" + ) + if is_fsspec_url(path_or_handle) and fs is None: + if storage_options is None: + pa = import_optional_dependency("pyarrow") + pa_fs = import_optional_dependency("pyarrow.fs") + + try: + fs, path_or_handle = pa_fs.FileSystem.from_uri(path) + except (TypeError, pa.ArrowInvalid): + pass + if fs is None: + fsspec = import_optional_dependency("fsspec") + fs, path_or_handle = fsspec.core.url_to_fs( + path_or_handle, **(storage_options or {}) + ) + elif storage_options and (not is_url(path_or_handle) or mode != "rb"): + # can't write to a remote url + # without making use of fsspec at the moment + raise ValueError("storage_options passed with buffer, or non-supported URL") + + handles = None + if ( + not fs + and not is_dir + and isinstance(path_or_handle, str) + and not os.path.isdir(path_or_handle) + ): + # use get_handle only when we are very certain that it is not a directory + # fsspec resources can also point to directories + # this branch is used for example when reading from non-fsspec URLs + handles = get_handle( + path_or_handle, mode, is_text=False, storage_options=storage_options + ) + fs = None + path_or_handle = handles.handle + return path_or_handle, handles, fs + + +class BaseImpl: + @staticmethod + def validate_dataframe(df: DataFrame) -> None: + if not isinstance(df, DataFrame): + raise ValueError("to_parquet only supports IO with DataFrames") + + def write(self, df: DataFrame, path, compression, **kwargs): + raise AbstractMethodError(self) + + def read(self, path, columns=None, **kwargs) -> DataFrame: + raise AbstractMethodError(self) + + +class PyArrowImpl(BaseImpl): + def __init__(self) -> None: + import_optional_dependency( + "pyarrow", extra="pyarrow is required for parquet support." + ) + import pyarrow.parquet + + # import utils to register the pyarrow extension types + import pandas.core.arrays.arrow.extension_types # pyright: ignore[reportUnusedImport] # noqa: F401 + + self.api = pyarrow + + def write( + self, + df: DataFrame, + path: FilePath | WriteBuffer[bytes], + compression: str | None = "snappy", + index: bool | None = None, + storage_options: StorageOptions | None = None, + partition_cols: list[str] | None = None, + filesystem=None, + **kwargs, + ) -> None: + self.validate_dataframe(df) + + from_pandas_kwargs: dict[str, Any] = {"schema": kwargs.pop("schema", None)} + if index is not None: + from_pandas_kwargs["preserve_index"] = index + + table = self.api.Table.from_pandas(df, **from_pandas_kwargs) + + if df.attrs: + df_metadata = {"PANDAS_ATTRS": json.dumps(df.attrs)} + existing_metadata = table.schema.metadata + merged_metadata = {**existing_metadata, **df_metadata} + table = table.replace_schema_metadata(merged_metadata) + + path_or_handle, handles, filesystem = _get_path_or_handle( + path, + filesystem, + storage_options=storage_options, + mode="wb", + is_dir=partition_cols is not None, + ) + if ( + isinstance(path_or_handle, io.BufferedWriter) + and hasattr(path_or_handle, "name") + and isinstance(path_or_handle.name, (str, bytes)) + ): + if isinstance(path_or_handle.name, bytes): + path_or_handle = path_or_handle.name.decode() + else: + path_or_handle = path_or_handle.name + + try: + if partition_cols is not None: + # writes to multiple files under the given path + self.api.parquet.write_to_dataset( + table, + path_or_handle, + compression=compression, + partition_cols=partition_cols, + filesystem=filesystem, + **kwargs, + ) + else: + # write to single output file + self.api.parquet.write_table( + table, + path_or_handle, + compression=compression, + filesystem=filesystem, + **kwargs, + ) + finally: + if handles is not None: + handles.close() + + def read( + self, + path, + columns=None, + filters=None, + use_nullable_dtypes: bool = False, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + storage_options: StorageOptions | None = None, + filesystem=None, + **kwargs, + ) -> DataFrame: + kwargs["use_pandas_metadata"] = True + + to_pandas_kwargs = {} + if dtype_backend == "numpy_nullable": + from pandas.io._util import _arrow_dtype_mapping + + mapping = _arrow_dtype_mapping() + to_pandas_kwargs["types_mapper"] = mapping.get + elif dtype_backend == "pyarrow": + to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] + elif using_pyarrow_string_dtype(): + to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper() + + manager = _get_option("mode.data_manager", silent=True) + if manager == "array": + to_pandas_kwargs["split_blocks"] = True # type: ignore[assignment] + + path_or_handle, handles, filesystem = _get_path_or_handle( + path, + filesystem, + storage_options=storage_options, + mode="rb", + ) + try: + pa_table = self.api.parquet.read_table( + path_or_handle, + columns=columns, + filesystem=filesystem, + filters=filters, + **kwargs, + ) + result = pa_table.to_pandas(**to_pandas_kwargs) + + if manager == "array": + result = result._as_manager("array", copy=False) + + if pa_table.schema.metadata: + if b"PANDAS_ATTRS" in pa_table.schema.metadata: + df_metadata = pa_table.schema.metadata[b"PANDAS_ATTRS"] + result.attrs = json.loads(df_metadata) + return result + finally: + if handles is not None: + handles.close() + + +class FastParquetImpl(BaseImpl): + def __init__(self) -> None: + # since pandas is a dependency of fastparquet + # we need to import on first use + fastparquet = import_optional_dependency( + "fastparquet", extra="fastparquet is required for parquet support." + ) + self.api = fastparquet + + def write( + self, + df: DataFrame, + path, + compression: Literal["snappy", "gzip", "brotli"] | None = "snappy", + index=None, + partition_cols=None, + storage_options: StorageOptions | None = None, + filesystem=None, + **kwargs, + ) -> None: + self.validate_dataframe(df) + + if "partition_on" in kwargs and partition_cols is not None: + raise ValueError( + "Cannot use both partition_on and " + "partition_cols. Use partition_cols for partitioning data" + ) + if "partition_on" in kwargs: + partition_cols = kwargs.pop("partition_on") + + if partition_cols is not None: + kwargs["file_scheme"] = "hive" + + if filesystem is not None: + raise NotImplementedError( + "filesystem is not implemented for the fastparquet engine." + ) + + # cannot use get_handle as write() does not accept file buffers + path = stringify_path(path) + if is_fsspec_url(path): + fsspec = import_optional_dependency("fsspec") + + # if filesystem is provided by fsspec, file must be opened in 'wb' mode. + kwargs["open_with"] = lambda path, _: fsspec.open( + path, "wb", **(storage_options or {}) + ).open() + elif storage_options: + raise ValueError( + "storage_options passed with file object or non-fsspec file path" + ) + + with catch_warnings(record=True): + self.api.write( + path, + df, + compression=compression, + write_index=index, + partition_on=partition_cols, + **kwargs, + ) + + def read( + self, + path, + columns=None, + filters=None, + storage_options: StorageOptions | None = None, + filesystem=None, + **kwargs, + ) -> DataFrame: + parquet_kwargs: dict[str, Any] = {} + use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False) + dtype_backend = kwargs.pop("dtype_backend", lib.no_default) + # We are disabling nullable dtypes for fastparquet pending discussion + parquet_kwargs["pandas_nulls"] = False + if use_nullable_dtypes: + raise ValueError( + "The 'use_nullable_dtypes' argument is not supported for the " + "fastparquet engine" + ) + if dtype_backend is not lib.no_default: + raise ValueError( + "The 'dtype_backend' argument is not supported for the " + "fastparquet engine" + ) + if filesystem is not None: + raise NotImplementedError( + "filesystem is not implemented for the fastparquet engine." + ) + path = stringify_path(path) + handles = None + if is_fsspec_url(path): + fsspec = import_optional_dependency("fsspec") + + parquet_kwargs["fs"] = fsspec.open(path, "rb", **(storage_options or {})).fs + elif isinstance(path, str) and not os.path.isdir(path): + # use get_handle only when we are very certain that it is not a directory + # fsspec resources can also point to directories + # this branch is used for example when reading from non-fsspec URLs + handles = get_handle( + path, "rb", is_text=False, storage_options=storage_options + ) + path = handles.handle + + try: + parquet_file = self.api.ParquetFile(path, **parquet_kwargs) + return parquet_file.to_pandas(columns=columns, filters=filters, **kwargs) + finally: + if handles is not None: + handles.close() + + +@doc(storage_options=_shared_docs["storage_options"]) +def to_parquet( + df: DataFrame, + path: FilePath | WriteBuffer[bytes] | None = None, + engine: str = "auto", + compression: str | None = "snappy", + index: bool | None = None, + storage_options: StorageOptions | None = None, + partition_cols: list[str] | None = None, + filesystem: Any = None, + **kwargs, +) -> bytes | None: + """ + Write a DataFrame to the parquet format. + + Parameters + ---------- + df : DataFrame + path : str, path object, file-like object, or None, default None + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``write()`` function. If None, the result is + returned as bytes. If a string, it will be used as Root Directory path + when writing a partitioned dataset. The engine fastparquet does not + accept file-like objects. + engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto' + Parquet library to use. If 'auto', then the option + ``io.parquet.engine`` is used. The default ``io.parquet.engine`` + behavior is to try 'pyarrow', falling back to 'fastparquet' if + 'pyarrow' is unavailable. + + When using the ``'pyarrow'`` engine and no storage options are provided + and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec`` + (e.g. "s3://"), then the ``pyarrow.fs`` filesystem is attempted first. + Use the filesystem keyword with an instantiated fsspec filesystem + if you wish to use its implementation. + compression : {{'snappy', 'gzip', 'brotli', 'lz4', 'zstd', None}}, + default 'snappy'. Name of the compression to use. Use ``None`` + for no compression. + index : bool, default None + If ``True``, include the dataframe's index(es) in the file output. If + ``False``, they will not be written to the file. + If ``None``, similar to ``True`` the dataframe's index(es) + will be saved. However, instead of being saved as values, + the RangeIndex will be stored as a range in the metadata so it + doesn't require much space and is faster. Other indexes will + be included as columns in the file output. + partition_cols : str or list, optional, default None + Column names by which to partition the dataset. + Columns are partitioned in the order they are given. + Must be None if path is not a string. + {storage_options} + + filesystem : fsspec or pyarrow filesystem, default None + Filesystem object to use when reading the parquet file. Only implemented + for ``engine="pyarrow"``. + + .. versionadded:: 2.1.0 + + kwargs + Additional keyword arguments passed to the engine + + Returns + ------- + bytes if no path argument is provided else None + """ + if isinstance(partition_cols, str): + partition_cols = [partition_cols] + impl = get_engine(engine) + + path_or_buf: FilePath | WriteBuffer[bytes] = io.BytesIO() if path is None else path + + impl.write( + df, + path_or_buf, + compression=compression, + index=index, + partition_cols=partition_cols, + storage_options=storage_options, + filesystem=filesystem, + **kwargs, + ) + + if path is None: + assert isinstance(path_or_buf, io.BytesIO) + return path_or_buf.getvalue() + else: + return None + + +@doc(storage_options=_shared_docs["storage_options"]) +def read_parquet( + path: FilePath | ReadBuffer[bytes], + engine: str = "auto", + columns: list[str] | None = None, + storage_options: StorageOptions | None = None, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + filesystem: Any = None, + filters: list[tuple] | list[list[tuple]] | None = None, + **kwargs, +) -> DataFrame: + """ + Load a parquet object from the file path, returning a DataFrame. + + Parameters + ---------- + path : str, path object or file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``read()`` function. + The string could be a URL. Valid URL schemes include http, ftp, s3, + gs, and file. For file URLs, a host is expected. A local file could be: + ``file://localhost/path/to/table.parquet``. + A file URL can also be a path to a directory that contains multiple + partitioned parquet files. Both pyarrow and fastparquet support + paths to directories as well as file URLs. A directory path could be: + ``file://localhost/path/to/tables`` or ``s3://bucket/partition_dir``. + engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto' + Parquet library to use. If 'auto', then the option + ``io.parquet.engine`` is used. The default ``io.parquet.engine`` + behavior is to try 'pyarrow', falling back to 'fastparquet' if + 'pyarrow' is unavailable. + + When using the ``'pyarrow'`` engine and no storage options are provided + and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec`` + (e.g. "s3://"), then the ``pyarrow.fs`` filesystem is attempted first. + Use the filesystem keyword with an instantiated fsspec filesystem + if you wish to use its implementation. + columns : list, default=None + If not None, only these columns will be read from the file. + {storage_options} + + .. versionadded:: 1.3.0 + + use_nullable_dtypes : bool, default False + If True, use dtypes that use ``pd.NA`` as missing value indicator + for the resulting DataFrame. (only applicable for the ``pyarrow`` + engine) + As new dtypes are added that support ``pd.NA`` in the future, the + output with this option will change to use those dtypes. + Note: this is an experimental option, and behaviour (e.g. additional + support dtypes) may change without notice. + + .. deprecated:: 2.0 + + dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). Behaviour is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + (default). + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` + DataFrame. + + .. versionadded:: 2.0 + + filesystem : fsspec or pyarrow filesystem, default None + Filesystem object to use when reading the parquet file. Only implemented + for ``engine="pyarrow"``. + + .. versionadded:: 2.1.0 + + filters : List[Tuple] or List[List[Tuple]], default None + To filter out data. + Filter syntax: [[(column, op, val), ...],...] + where op is [==, =, >, >=, <, <=, !=, in, not in] + The innermost tuples are transposed into a set of filters applied + through an `AND` operation. + The outer list combines these sets of filters through an `OR` + operation. + A single list of tuples can also be used, meaning that no `OR` + operation between set of filters is to be conducted. + + Using this argument will NOT result in row-wise filtering of the final + partitions unless ``engine="pyarrow"`` is also specified. For + other engines, filtering is only performed at the partition level, that is, + to prevent the loading of some row-groups and/or files. + + .. versionadded:: 2.1.0 + + **kwargs + Any additional kwargs are passed to the engine. + + Returns + ------- + DataFrame + + See Also + -------- + DataFrame.to_parquet : Create a parquet object that serializes a DataFrame. + + Examples + -------- + >>> original_df = pd.DataFrame( + ... {{"foo": range(5), "bar": range(5, 10)}} + ... ) + >>> original_df + foo bar + 0 0 5 + 1 1 6 + 2 2 7 + 3 3 8 + 4 4 9 + >>> df_parquet_bytes = original_df.to_parquet() + >>> from io import BytesIO + >>> restored_df = pd.read_parquet(BytesIO(df_parquet_bytes)) + >>> restored_df + foo bar + 0 0 5 + 1 1 6 + 2 2 7 + 3 3 8 + 4 4 9 + >>> restored_df.equals(original_df) + True + >>> restored_bar = pd.read_parquet(BytesIO(df_parquet_bytes), columns=["bar"]) + >>> restored_bar + bar + 0 5 + 1 6 + 2 7 + 3 8 + 4 9 + >>> restored_bar.equals(original_df[['bar']]) + True + + The function uses `kwargs` that are passed directly to the engine. + In the following example, we use the `filters` argument of the pyarrow + engine to filter the rows of the DataFrame. + + Since `pyarrow` is the default engine, we can omit the `engine` argument. + Note that the `filters` argument is implemented by the `pyarrow` engine, + which can benefit from multithreading and also potentially be more + economical in terms of memory. + + >>> sel = [("foo", ">", 2)] + >>> restored_part = pd.read_parquet(BytesIO(df_parquet_bytes), filters=sel) + >>> restored_part + foo bar + 0 3 8 + 1 4 9 + """ + + impl = get_engine(engine) + + if use_nullable_dtypes is not lib.no_default: + msg = ( + "The argument 'use_nullable_dtypes' is deprecated and will be removed " + "in a future version." + ) + if use_nullable_dtypes is True: + msg += ( + "Use dtype_backend='numpy_nullable' instead of use_nullable_dtype=True." + ) + warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) + else: + use_nullable_dtypes = False + check_dtype_backend(dtype_backend) + + return impl.read( + path, + columns=columns, + filters=filters, + storage_options=storage_options, + use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, + filesystem=filesystem, + **kwargs, + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/spss.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/spss.py new file mode 100644 index 0000000000000000000000000000000000000000..db31a07df79e6de2862e57fd75de0bd4b9c2455d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/spss.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from pandas._libs import lib +from pandas.compat._optional import import_optional_dependency +from pandas.util._validators import check_dtype_backend + +from pandas.core.dtypes.inference import is_list_like + +from pandas.io.common import stringify_path + +if TYPE_CHECKING: + from collections.abc import Sequence + from pathlib import Path + + from pandas._typing import DtypeBackend + + from pandas import DataFrame + + +def read_spss( + path: str | Path, + usecols: Sequence[str] | None = None, + convert_categoricals: bool = True, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, +) -> DataFrame: + """ + Load an SPSS file from the file path, returning a DataFrame. + + Parameters + ---------- + path : str or Path + File path. + usecols : list-like, optional + Return a subset of the columns. If None, return all columns. + convert_categoricals : bool, default is True + Convert categorical columns into pd.Categorical. + dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). Behaviour is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + (default). + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` + DataFrame. + + .. versionadded:: 2.0 + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pd.read_spss("spss_data.sav") # doctest: +SKIP + """ + pyreadstat = import_optional_dependency("pyreadstat") + check_dtype_backend(dtype_backend) + + if usecols is not None: + if not is_list_like(usecols): + raise TypeError("usecols must be list-like.") + usecols = list(usecols) # pyreadstat requires a list + + df, metadata = pyreadstat.read_sav( + stringify_path(path), usecols=usecols, apply_value_formats=convert_categoricals + ) + df.attrs = metadata.__dict__ + if dtype_backend is not lib.no_default: + df = df.convert_dtypes(dtype_backend=dtype_backend) + return df diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/stata.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/stata.py new file mode 100644 index 0000000000000000000000000000000000000000..4abf9af185a01284ab4c3ba968335117b3f9b81b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/stata.py @@ -0,0 +1,3763 @@ +""" +Module contains tools for processing Stata files into DataFrames + +The StataReader below was originally written by Joe Presbrey as part of PyDTA. +It has been extended and improved by Skipper Seabold from the Statsmodels +project who also developed the StataWriter and was finally added to pandas in +a once again improved version. + +You can find more information on http://presbrey.mit.edu/PyDTA and +https://www.statsmodels.org/devel/ +""" +from __future__ import annotations + +from collections import abc +from datetime import ( + datetime, + timedelta, +) +from io import BytesIO +import os +import struct +import sys +from typing import ( + IO, + TYPE_CHECKING, + AnyStr, + Callable, + Final, + cast, +) +import warnings + +import numpy as np + +from pandas._libs import lib +from pandas._libs.lib import infer_dtype +from pandas._libs.writers import max_len_string_array +from pandas.errors import ( + CategoricalConversionWarning, + InvalidColumnName, + PossiblePrecisionLoss, + ValueLabelTypeMismatch, +) +from pandas.util._decorators import ( + Appender, + doc, +) +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.common import ( + ensure_object, + is_numeric_dtype, + is_string_dtype, +) +from pandas.core.dtypes.dtypes import CategoricalDtype + +from pandas import ( + Categorical, + DatetimeIndex, + NaT, + Timestamp, + isna, + to_datetime, + to_timedelta, +) +from pandas.core.frame import DataFrame +from pandas.core.indexes.base import Index +from pandas.core.indexes.range import RangeIndex +from pandas.core.series import Series +from pandas.core.shared_docs import _shared_docs + +from pandas.io.common import get_handle + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Sequence, + ) + from types import TracebackType + from typing import Literal + + from pandas._typing import ( + CompressionOptions, + FilePath, + ReadBuffer, + Self, + StorageOptions, + WriteBuffer, + ) + +_version_error = ( + "Version of given Stata file is {version}. pandas supports importing " + "versions 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), " + "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16)," + "and 119 (Stata 15/16, over 32,767 variables)." +) + +_statafile_processing_params1 = """\ +convert_dates : bool, default True + Convert date variables to DataFrame time values. +convert_categoricals : bool, default True + Read value labels and convert columns to Categorical/Factor variables.""" + +_statafile_processing_params2 = """\ +index_col : str, optional + Column to set as index. +convert_missing : bool, default False + Flag indicating whether to convert missing values to their Stata + representations. If False, missing values are replaced with nan. + If True, columns containing missing values are returned with + object data types and missing values are represented by + StataMissingValue objects. +preserve_dtypes : bool, default True + Preserve Stata datatypes. If False, numeric data are upcast to pandas + default types for foreign data (float64 or int64). +columns : list or None + Columns to retain. Columns will be returned in the given order. None + returns all columns. +order_categoricals : bool, default True + Flag indicating whether converted categorical data are ordered.""" + +_chunksize_params = """\ +chunksize : int, default None + Return StataReader object for iterations, returns chunks with + given number of lines.""" + +_iterator_params = """\ +iterator : bool, default False + Return StataReader object.""" + +_reader_notes = """\ +Notes +----- +Categorical variables read through an iterator may not have the same +categories and dtype. This occurs when a variable stored in a DTA +file is associated to an incomplete set of value labels that only +label a strict subset of the values.""" + +_read_stata_doc = f""" +Read Stata file into DataFrame. + +Parameters +---------- +filepath_or_buffer : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: ``file://localhost/path/to/table.dta``. + + If you want to pass in a path object, pandas accepts any ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handle (e.g. via builtin ``open`` function) + or ``StringIO``. +{_statafile_processing_params1} +{_statafile_processing_params2} +{_chunksize_params} +{_iterator_params} +{_shared_docs["decompression_options"] % "filepath_or_buffer"} +{_shared_docs["storage_options"]} + +Returns +------- +DataFrame or pandas.api.typing.StataReader + +See Also +-------- +io.stata.StataReader : Low-level reader for Stata data files. +DataFrame.to_stata: Export Stata data files. + +{_reader_notes} + +Examples +-------- + +Creating a dummy stata for this example + +>>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', 'parrot'], +... 'speed': [350, 18, 361, 15]}}) # doctest: +SKIP +>>> df.to_stata('animals.dta') # doctest: +SKIP + +Read a Stata dta file: + +>>> df = pd.read_stata('animals.dta') # doctest: +SKIP + +Read a Stata dta file in 10,000 line chunks: + +>>> values = np.random.randint(0, 10, size=(20_000, 1), dtype="uint8") # doctest: +SKIP +>>> df = pd.DataFrame(values, columns=["i"]) # doctest: +SKIP +>>> df.to_stata('filename.dta') # doctest: +SKIP + +>>> with pd.read_stata('filename.dta', chunksize=10000) as itr: # doctest: +SKIP +>>> for chunk in itr: +... # Operate on a single chunk, e.g., chunk.mean() +... pass # doctest: +SKIP +""" + +_read_method_doc = f"""\ +Reads observations from Stata file, converting them into a dataframe + +Parameters +---------- +nrows : int + Number of lines to read from data file, if None read whole file. +{_statafile_processing_params1} +{_statafile_processing_params2} + +Returns +------- +DataFrame +""" + +_stata_reader_doc = f"""\ +Class for reading Stata dta files. + +Parameters +---------- +path_or_buf : path (string), buffer or path object + string, path object (pathlib.Path or py._path.local.LocalPath) or object + implementing a binary read() functions. +{_statafile_processing_params1} +{_statafile_processing_params2} +{_chunksize_params} +{_shared_docs["decompression_options"]} +{_shared_docs["storage_options"]} + +{_reader_notes} +""" + + +_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"] + + +stata_epoch: Final = datetime(1960, 1, 1) + + +def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series: + """ + Convert from SIF to datetime. https://www.stata.com/help.cgi?datetime + + Parameters + ---------- + dates : Series + The Stata Internal Format date to convert to datetime according to fmt + fmt : str + The format to convert to. Can be, tc, td, tw, tm, tq, th, ty + Returns + + Returns + ------- + converted : Series + The converted dates + + Examples + -------- + >>> dates = pd.Series([52]) + >>> _stata_elapsed_date_to_datetime_vec(dates , "%tw") + 0 1961-01-01 + dtype: datetime64[ns] + + Notes + ----- + datetime/c - tc + milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day + datetime/C - tC - NOT IMPLEMENTED + milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds + date - td + days since 01jan1960 (01jan1960 = 0) + weekly date - tw + weeks since 1960w1 + This assumes 52 weeks in a year, then adds 7 * remainder of the weeks. + The datetime value is the start of the week in terms of days in the + year, not ISO calendar weeks. + monthly date - tm + months since 1960m1 + quarterly date - tq + quarters since 1960q1 + half-yearly date - th + half-years since 1960h1 yearly + date - ty + years since 0000 + """ + MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year + MAX_DAY_DELTA = (Timestamp.max - datetime(1960, 1, 1)).days + MIN_DAY_DELTA = (Timestamp.min - datetime(1960, 1, 1)).days + MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000 + MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000 + + def convert_year_month_safe(year, month) -> Series: + """ + Convert year and month to datetimes, using pandas vectorized versions + when the date range falls within the range supported by pandas. + Otherwise it falls back to a slower but more robust method + using datetime. + """ + if year.max() < MAX_YEAR and year.min() > MIN_YEAR: + return to_datetime(100 * year + month, format="%Y%m") + else: + index = getattr(year, "index", None) + return Series([datetime(y, m, 1) for y, m in zip(year, month)], index=index) + + def convert_year_days_safe(year, days) -> Series: + """ + Converts year (e.g. 1999) and days since the start of the year to a + datetime or datetime64 Series + """ + if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR: + return to_datetime(year, format="%Y") + to_timedelta(days, unit="d") + else: + index = getattr(year, "index", None) + value = [ + datetime(y, 1, 1) + timedelta(days=int(d)) for y, d in zip(year, days) + ] + return Series(value, index=index) + + def convert_delta_safe(base, deltas, unit) -> Series: + """ + Convert base dates and deltas to datetimes, using pandas vectorized + versions if the deltas satisfy restrictions required to be expressed + as dates in pandas. + """ + index = getattr(deltas, "index", None) + if unit == "d": + if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA: + values = [base + timedelta(days=int(d)) for d in deltas] + return Series(values, index=index) + elif unit == "ms": + if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA: + values = [ + base + timedelta(microseconds=(int(d) * 1000)) for d in deltas + ] + return Series(values, index=index) + else: + raise ValueError("format not understood") + base = to_datetime(base) + deltas = to_timedelta(deltas, unit=unit) + return base + deltas + + # TODO(non-nano): If/when pandas supports more than datetime64[ns], this + # should be improved to use correct range, e.g. datetime[Y] for yearly + bad_locs = np.isnan(dates) + has_bad_values = False + if bad_locs.any(): + has_bad_values = True + dates._values[bad_locs] = 1.0 # Replace with NaT + dates = dates.astype(np.int64) + + if fmt.startswith(("%tc", "tc")): # Delta ms relative to base + base = stata_epoch + ms = dates + conv_dates = convert_delta_safe(base, ms, "ms") + elif fmt.startswith(("%tC", "tC")): + warnings.warn( + "Encountered %tC format. Leaving in Stata Internal Format.", + stacklevel=find_stack_level(), + ) + conv_dates = Series(dates, dtype=object) + if has_bad_values: + conv_dates[bad_locs] = NaT + return conv_dates + # Delta days relative to base + elif fmt.startswith(("%td", "td", "%d", "d")): + base = stata_epoch + days = dates + conv_dates = convert_delta_safe(base, days, "d") + # does not count leap days - 7 days is a week. + # 52nd week may have more than 7 days + elif fmt.startswith(("%tw", "tw")): + year = stata_epoch.year + dates // 52 + days = (dates % 52) * 7 + conv_dates = convert_year_days_safe(year, days) + elif fmt.startswith(("%tm", "tm")): # Delta months relative to base + year = stata_epoch.year + dates // 12 + month = (dates % 12) + 1 + conv_dates = convert_year_month_safe(year, month) + elif fmt.startswith(("%tq", "tq")): # Delta quarters relative to base + year = stata_epoch.year + dates // 4 + quarter_month = (dates % 4) * 3 + 1 + conv_dates = convert_year_month_safe(year, quarter_month) + elif fmt.startswith(("%th", "th")): # Delta half-years relative to base + year = stata_epoch.year + dates // 2 + month = (dates % 2) * 6 + 1 + conv_dates = convert_year_month_safe(year, month) + elif fmt.startswith(("%ty", "ty")): # Years -- not delta + year = dates + first_month = np.ones_like(dates) + conv_dates = convert_year_month_safe(year, first_month) + else: + raise ValueError(f"Date fmt {fmt} not understood") + + if has_bad_values: # Restore NaT for bad values + conv_dates[bad_locs] = NaT + + return conv_dates + + +def _datetime_to_stata_elapsed_vec(dates: Series, fmt: str) -> Series: + """ + Convert from datetime to SIF. https://www.stata.com/help.cgi?datetime + + Parameters + ---------- + dates : Series + Series or array containing datetime or datetime64[ns] to + convert to the Stata Internal Format given by fmt + fmt : str + The format to convert to. Can be, tc, td, tw, tm, tq, th, ty + """ + index = dates.index + NS_PER_DAY = 24 * 3600 * 1000 * 1000 * 1000 + US_PER_DAY = NS_PER_DAY / 1000 + + def parse_dates_safe( + dates: Series, delta: bool = False, year: bool = False, days: bool = False + ): + d = {} + if lib.is_np_dtype(dates.dtype, "M"): + if delta: + time_delta = dates - Timestamp(stata_epoch).as_unit("ns") + d["delta"] = time_delta._values.view(np.int64) // 1000 # microseconds + if days or year: + date_index = DatetimeIndex(dates) + d["year"] = date_index._data.year + d["month"] = date_index._data.month + if days: + days_in_ns = dates._values.view(np.int64) - to_datetime( + d["year"], format="%Y" + )._values.view(np.int64) + d["days"] = days_in_ns // NS_PER_DAY + + elif infer_dtype(dates, skipna=False) == "datetime": + if delta: + delta = dates._values - stata_epoch + + def f(x: timedelta) -> float: + return US_PER_DAY * x.days + 1000000 * x.seconds + x.microseconds + + v = np.vectorize(f) + d["delta"] = v(delta) + if year: + year_month = dates.apply(lambda x: 100 * x.year + x.month) + d["year"] = year_month._values // 100 + d["month"] = year_month._values - d["year"] * 100 + if days: + + def g(x: datetime) -> int: + return (x - datetime(x.year, 1, 1)).days + + v = np.vectorize(g) + d["days"] = v(dates) + else: + raise ValueError( + "Columns containing dates must contain either " + "datetime64, datetime or null values." + ) + + return DataFrame(d, index=index) + + bad_loc = isna(dates) + index = dates.index + if bad_loc.any(): + if lib.is_np_dtype(dates.dtype, "M"): + dates._values[bad_loc] = to_datetime(stata_epoch) + else: + dates._values[bad_loc] = stata_epoch + + if fmt in ["%tc", "tc"]: + d = parse_dates_safe(dates, delta=True) + conv_dates = d.delta / 1000 + elif fmt in ["%tC", "tC"]: + warnings.warn( + "Stata Internal Format tC not supported.", + stacklevel=find_stack_level(), + ) + conv_dates = dates + elif fmt in ["%td", "td"]: + d = parse_dates_safe(dates, delta=True) + conv_dates = d.delta // US_PER_DAY + elif fmt in ["%tw", "tw"]: + d = parse_dates_safe(dates, year=True, days=True) + conv_dates = 52 * (d.year - stata_epoch.year) + d.days // 7 + elif fmt in ["%tm", "tm"]: + d = parse_dates_safe(dates, year=True) + conv_dates = 12 * (d.year - stata_epoch.year) + d.month - 1 + elif fmt in ["%tq", "tq"]: + d = parse_dates_safe(dates, year=True) + conv_dates = 4 * (d.year - stata_epoch.year) + (d.month - 1) // 3 + elif fmt in ["%th", "th"]: + d = parse_dates_safe(dates, year=True) + conv_dates = 2 * (d.year - stata_epoch.year) + (d.month > 6).astype(int) + elif fmt in ["%ty", "ty"]: + d = parse_dates_safe(dates, year=True) + conv_dates = d.year + else: + raise ValueError(f"Format {fmt} is not a known Stata date format") + + conv_dates = Series(conv_dates, dtype=np.float64, copy=False) + missing_value = struct.unpack(" DataFrame: + """ + Checks the dtypes of the columns of a pandas DataFrame for + compatibility with the data types and ranges supported by Stata, and + converts if necessary. + + Parameters + ---------- + data : DataFrame + The DataFrame to check and convert + + Notes + ----- + Numeric columns in Stata must be one of int8, int16, int32, float32 or + float64, with some additional value restrictions. int8 and int16 columns + are checked for violations of the value restrictions and upcast if needed. + int64 data is not usable in Stata, and so it is downcast to int32 whenever + the value are in the int32 range, and sidecast to float64 when larger than + this range. If the int64 values are outside of the range of those + perfectly representable as float64 values, a warning is raised. + + bool columns are cast to int8. uint columns are converted to int of the + same size if there is no loss in precision, otherwise are upcast to a + larger type. uint64 is currently not supported since it is concerted to + object in a DataFrame. + """ + ws = "" + # original, if small, if large + conversion_data: tuple[ + tuple[type, type, type], + tuple[type, type, type], + tuple[type, type, type], + tuple[type, type, type], + tuple[type, type, type], + ] = ( + (np.bool_, np.int8, np.int8), + (np.uint8, np.int8, np.int16), + (np.uint16, np.int16, np.int32), + (np.uint32, np.int32, np.int64), + (np.uint64, np.int64, np.float64), + ) + + float32_max = struct.unpack("= 2**53: + ws = precision_loss_doc.format("uint64", "float64") + + data[col] = data[col].astype(dtype) + + # Check values and upcast if necessary + + if dtype == np.int8 and not empty_df: + if data[col].max() > 100 or data[col].min() < -127: + data[col] = data[col].astype(np.int16) + elif dtype == np.int16 and not empty_df: + if data[col].max() > 32740 or data[col].min() < -32767: + data[col] = data[col].astype(np.int32) + elif dtype == np.int64: + if empty_df or ( + data[col].max() <= 2147483620 and data[col].min() >= -2147483647 + ): + data[col] = data[col].astype(np.int32) + else: + data[col] = data[col].astype(np.float64) + if data[col].max() >= 2**53 or data[col].min() <= -(2**53): + ws = precision_loss_doc.format("int64", "float64") + elif dtype in (np.float32, np.float64): + if np.isinf(data[col]).any(): + raise ValueError( + f"Column {col} contains infinity or -infinity" + "which is outside the range supported by Stata." + ) + value = data[col].max() + if dtype == np.float32 and value > float32_max: + data[col] = data[col].astype(np.float64) + elif dtype == np.float64: + if value > float64_max: + raise ValueError( + f"Column {col} has a maximum value ({value}) outside the range " + f"supported by Stata ({float64_max})" + ) + if is_nullable_int: + if orig_missing.any(): + # Replace missing by Stata sentinel value + sentinel = StataMissingValue.BASE_MISSING_VALUES[data[col].dtype.name] + data.loc[orig_missing, col] = sentinel + if ws: + warnings.warn( + ws, + PossiblePrecisionLoss, + stacklevel=find_stack_level(), + ) + + return data + + +class StataValueLabel: + """ + Parse a categorical column and prepare formatted output + + Parameters + ---------- + catarray : Series + Categorical Series to encode + encoding : {"latin-1", "utf-8"} + Encoding to use for value labels. + """ + + def __init__( + self, catarray: Series, encoding: Literal["latin-1", "utf-8"] = "latin-1" + ) -> None: + if encoding not in ("latin-1", "utf-8"): + raise ValueError("Only latin-1 and utf-8 are supported.") + self.labname = catarray.name + self._encoding = encoding + categories = catarray.cat.categories + self.value_labels = enumerate(categories) + + self._prepare_value_labels() + + def _prepare_value_labels(self) -> None: + """Encode value labels.""" + + self.text_len = 0 + self.txt: list[bytes] = [] + self.n = 0 + # Offsets (length of categories), converted to int32 + self.off = np.array([], dtype=np.int32) + # Values, converted to int32 + self.val = np.array([], dtype=np.int32) + self.len = 0 + + # Compute lengths and setup lists of offsets and labels + offsets: list[int] = [] + values: list[float] = [] + for vl in self.value_labels: + category: str | bytes = vl[1] + if not isinstance(category, str): + category = str(category) + warnings.warn( + value_label_mismatch_doc.format(self.labname), + ValueLabelTypeMismatch, + stacklevel=find_stack_level(), + ) + category = category.encode(self._encoding) + offsets.append(self.text_len) + self.text_len += len(category) + 1 # +1 for the padding + values.append(vl[0]) + self.txt.append(category) + self.n += 1 + + if self.text_len > 32000: + raise ValueError( + "Stata value labels for a single variable must " + "have a combined length less than 32,000 characters." + ) + + # Ensure int32 + self.off = np.array(offsets, dtype=np.int32) + self.val = np.array(values, dtype=np.int32) + + # Total length + self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len + + def generate_value_label(self, byteorder: str) -> bytes: + """ + Generate the binary representation of the value labels. + + Parameters + ---------- + byteorder : str + Byte order of the output + + Returns + ------- + value_label : bytes + Bytes containing the formatted value label + """ + encoding = self._encoding + bio = BytesIO() + null_byte = b"\x00" + + # len + bio.write(struct.pack(byteorder + "i", self.len)) + + # labname + labname = str(self.labname)[:32].encode(encoding) + lab_len = 32 if encoding not in ("utf-8", "utf8") else 128 + labname = _pad_bytes(labname, lab_len + 1) + bio.write(labname) + + # padding - 3 bytes + for i in range(3): + bio.write(struct.pack("c", null_byte)) + + # value_label_table + # n - int32 + bio.write(struct.pack(byteorder + "i", self.n)) + + # textlen - int32 + bio.write(struct.pack(byteorder + "i", self.text_len)) + + # off - int32 array (n elements) + for offset in self.off: + bio.write(struct.pack(byteorder + "i", offset)) + + # val - int32 array (n elements) + for value in self.val: + bio.write(struct.pack(byteorder + "i", value)) + + # txt - Text labels, null terminated + for text in self.txt: + bio.write(text + null_byte) + + return bio.getvalue() + + +class StataNonCatValueLabel(StataValueLabel): + """ + Prepare formatted version of value labels + + Parameters + ---------- + labname : str + Value label name + value_labels: Dictionary + Mapping of values to labels + encoding : {"latin-1", "utf-8"} + Encoding to use for value labels. + """ + + def __init__( + self, + labname: str, + value_labels: dict[float, str], + encoding: Literal["latin-1", "utf-8"] = "latin-1", + ) -> None: + if encoding not in ("latin-1", "utf-8"): + raise ValueError("Only latin-1 and utf-8 are supported.") + + self.labname = labname + self._encoding = encoding + self.value_labels = sorted( # type: ignore[assignment] + value_labels.items(), key=lambda x: x[0] + ) + self._prepare_value_labels() + + +class StataMissingValue: + """ + An observation's missing value. + + Parameters + ---------- + value : {int, float} + The Stata missing value code + + Notes + ----- + More information: + + Integer missing values make the code '.', '.a', ..., '.z' to the ranges + 101 ... 127 (for int8), 32741 ... 32767 (for int16) and 2147483621 ... + 2147483647 (for int32). Missing values for floating point data types are + more complex but the pattern is simple to discern from the following table. + + np.float32 missing values (float in Stata) + 0000007f . + 0008007f .a + 0010007f .b + ... + 00c0007f .x + 00c8007f .y + 00d0007f .z + + np.float64 missing values (double in Stata) + 000000000000e07f . + 000000000001e07f .a + 000000000002e07f .b + ... + 000000000018e07f .x + 000000000019e07f .y + 00000000001ae07f .z + """ + + # Construct a dictionary of missing values + MISSING_VALUES: dict[float, str] = {} + bases: Final = (101, 32741, 2147483621) + for b in bases: + # Conversion to long to avoid hash issues on 32 bit platforms #8968 + MISSING_VALUES[b] = "." + for i in range(1, 27): + MISSING_VALUES[i + b] = "." + chr(96 + i) + + float32_base: bytes = b"\x00\x00\x00\x7f" + increment_32: int = struct.unpack(" 0: + MISSING_VALUES[key] += chr(96 + i) + int_value = struct.unpack(" 0: + MISSING_VALUES[key] += chr(96 + i) + int_value = struct.unpack("q", struct.pack(" None: + self._value = value + # Conversion to int to avoid hash issues on 32 bit platforms #8968 + value = int(value) if value < 2147483648 else float(value) + self._str = self.MISSING_VALUES[value] + + @property + def string(self) -> str: + """ + The Stata representation of the missing value: '.', '.a'..'.z' + + Returns + ------- + str + The representation of the missing value. + """ + return self._str + + @property + def value(self) -> float: + """ + The binary representation of the missing value. + + Returns + ------- + {int, float} + The binary representation of the missing value. + """ + return self._value + + def __str__(self) -> str: + return self.string + + def __repr__(self) -> str: + return f"{type(self)}({self})" + + def __eq__(self, other: object) -> bool: + return ( + isinstance(other, type(self)) + and self.string == other.string + and self.value == other.value + ) + + @classmethod + def get_base_missing_value(cls, dtype: np.dtype) -> float: + if dtype.type is np.int8: + value = cls.BASE_MISSING_VALUES["int8"] + elif dtype.type is np.int16: + value = cls.BASE_MISSING_VALUES["int16"] + elif dtype.type is np.int32: + value = cls.BASE_MISSING_VALUES["int32"] + elif dtype.type is np.float32: + value = cls.BASE_MISSING_VALUES["float32"] + elif dtype.type is np.float64: + value = cls.BASE_MISSING_VALUES["float64"] + else: + raise ValueError("Unsupported dtype") + return value + + +class StataParser: + def __init__(self) -> None: + # type code. + # -------------------- + # str1 1 = 0x01 + # str2 2 = 0x02 + # ... + # str244 244 = 0xf4 + # byte 251 = 0xfb (sic) + # int 252 = 0xfc + # long 253 = 0xfd + # float 254 = 0xfe + # double 255 = 0xff + # -------------------- + # NOTE: the byte type seems to be reserved for categorical variables + # with a label, but the underlying variable is -127 to 100 + # we're going to drop the label and cast to int + self.DTYPE_MAP = dict( + [(i, np.dtype(f"S{i}")) for i in range(1, 245)] + + [ + (251, np.dtype(np.int8)), + (252, np.dtype(np.int16)), + (253, np.dtype(np.int32)), + (254, np.dtype(np.float32)), + (255, np.dtype(np.float64)), + ] + ) + self.DTYPE_MAP_XML: dict[int, np.dtype] = { + 32768: np.dtype(np.uint8), # Keys to GSO + 65526: np.dtype(np.float64), + 65527: np.dtype(np.float32), + 65528: np.dtype(np.int32), + 65529: np.dtype(np.int16), + 65530: np.dtype(np.int8), + } + self.TYPE_MAP = list(tuple(range(251)) + tuple("bhlfd")) + self.TYPE_MAP_XML = { + # Not really a Q, unclear how to handle byteswap + 32768: "Q", + 65526: "d", + 65527: "f", + 65528: "l", + 65529: "h", + 65530: "b", + } + # NOTE: technically, some of these are wrong. there are more numbers + # that can be represented. it's the 27 ABOVE and BELOW the max listed + # numeric data type in [U] 12.2.2 of the 11.2 manual + float32_min = b"\xff\xff\xff\xfe" + float32_max = b"\xff\xff\xff\x7e" + float64_min = b"\xff\xff\xff\xff\xff\xff\xef\xff" + float64_max = b"\xff\xff\xff\xff\xff\xff\xdf\x7f" + self.VALID_RANGE = { + "b": (-127, 100), + "h": (-32767, 32740), + "l": (-2147483647, 2147483620), + "f": ( + np.float32(struct.unpack(" None: + super().__init__() + + # Arguments to the reader (can be temporarily overridden in + # calls to read). + self._convert_dates = convert_dates + self._convert_categoricals = convert_categoricals + self._index_col = index_col + self._convert_missing = convert_missing + self._preserve_dtypes = preserve_dtypes + self._columns = columns + self._order_categoricals = order_categoricals + self._original_path_or_buf = path_or_buf + self._compression = compression + self._storage_options = storage_options + self._encoding = "" + self._chunksize = chunksize + self._using_iterator = False + self._entered = False + if self._chunksize is None: + self._chunksize = 1 + elif not isinstance(chunksize, int) or chunksize <= 0: + raise ValueError("chunksize must be a positive integer when set.") + + # State variables for the file + self._close_file: Callable[[], None] | None = None + self._missing_values = False + self._can_read_value_labels = False + self._column_selector_set = False + self._value_labels_read = False + self._data_read = False + self._dtype: np.dtype | None = None + self._lines_read = 0 + + self._native_byteorder = _set_endianness(sys.byteorder) + + def _ensure_open(self) -> None: + """ + Ensure the file has been opened and its header data read. + """ + if not hasattr(self, "_path_or_buf"): + self._open_file() + + def _open_file(self) -> None: + """ + Open the file (with compression options, etc.), and read header information. + """ + if not self._entered: + warnings.warn( + "StataReader is being used without using a context manager. " + "Using StataReader as a context manager is the only supported method.", + ResourceWarning, + stacklevel=find_stack_level(), + ) + handles = get_handle( + self._original_path_or_buf, + "rb", + storage_options=self._storage_options, + is_text=False, + compression=self._compression, + ) + if hasattr(handles.handle, "seekable") and handles.handle.seekable(): + # If the handle is directly seekable, use it without an extra copy. + self._path_or_buf = handles.handle + self._close_file = handles.close + else: + # Copy to memory, and ensure no encoding. + with handles: + self._path_or_buf = BytesIO(handles.handle.read()) + self._close_file = self._path_or_buf.close + + self._read_header() + self._setup_dtype() + + def __enter__(self) -> Self: + """enter context manager""" + self._entered = True + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + traceback: TracebackType | None, + ) -> None: + if self._close_file: + self._close_file() + + def close(self) -> None: + """Close the handle if its open. + + .. deprecated: 2.0.0 + + The close method is not part of the public API. + The only supported way to use StataReader is to use it as a context manager. + """ + warnings.warn( + "The StataReader.close() method is not part of the public API and " + "will be removed in a future version without notice. " + "Using StataReader as a context manager is the only supported method.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if self._close_file: + self._close_file() + + def _set_encoding(self) -> None: + """ + Set string encoding which depends on file version + """ + if self._format_version < 118: + self._encoding = "latin-1" + else: + self._encoding = "utf-8" + + def _read_int8(self) -> int: + return struct.unpack("b", self._path_or_buf.read(1))[0] + + def _read_uint8(self) -> int: + return struct.unpack("B", self._path_or_buf.read(1))[0] + + def _read_uint16(self) -> int: + return struct.unpack(f"{self._byteorder}H", self._path_or_buf.read(2))[0] + + def _read_uint32(self) -> int: + return struct.unpack(f"{self._byteorder}I", self._path_or_buf.read(4))[0] + + def _read_uint64(self) -> int: + return struct.unpack(f"{self._byteorder}Q", self._path_or_buf.read(8))[0] + + def _read_int16(self) -> int: + return struct.unpack(f"{self._byteorder}h", self._path_or_buf.read(2))[0] + + def _read_int32(self) -> int: + return struct.unpack(f"{self._byteorder}i", self._path_or_buf.read(4))[0] + + def _read_int64(self) -> int: + return struct.unpack(f"{self._byteorder}q", self._path_or_buf.read(8))[0] + + def _read_char8(self) -> bytes: + return struct.unpack("c", self._path_or_buf.read(1))[0] + + def _read_int16_count(self, count: int) -> tuple[int, ...]: + return struct.unpack( + f"{self._byteorder}{'h' * count}", + self._path_or_buf.read(2 * count), + ) + + def _read_header(self) -> None: + first_char = self._read_char8() + if first_char == b"<": + self._read_new_header() + else: + self._read_old_header(first_char) + + def _read_new_header(self) -> None: + # The first part of the header is common to 117 - 119. + self._path_or_buf.read(27) # stata_dta>
+ self._format_version = int(self._path_or_buf.read(3)) + if self._format_version not in [117, 118, 119]: + raise ValueError(_version_error.format(version=self._format_version)) + self._set_encoding() + self._path_or_buf.read(21) # + self._byteorder = ">" if self._path_or_buf.read(3) == b"MSF" else "<" + self._path_or_buf.read(15) # + self._nvar = ( + self._read_uint16() if self._format_version <= 118 else self._read_uint32() + ) + self._path_or_buf.read(7) # + + self._nobs = self._get_nobs() + self._path_or_buf.read(11) # + self._time_stamp = self._get_time_stamp() + self._path_or_buf.read(26) #
+ self._path_or_buf.read(8) # 0x0000000000000000 + self._path_or_buf.read(8) # position of + + self._seek_vartypes = self._read_int64() + 16 + self._seek_varnames = self._read_int64() + 10 + self._seek_sortlist = self._read_int64() + 10 + self._seek_formats = self._read_int64() + 9 + self._seek_value_label_names = self._read_int64() + 19 + + # Requires version-specific treatment + self._seek_variable_labels = self._get_seek_variable_labels() + + self._path_or_buf.read(8) # + self._data_location = self._read_int64() + 6 + self._seek_strls = self._read_int64() + 7 + self._seek_value_labels = self._read_int64() + 14 + + self._typlist, self._dtyplist = self._get_dtypes(self._seek_vartypes) + + self._path_or_buf.seek(self._seek_varnames) + self._varlist = self._get_varlist() + + self._path_or_buf.seek(self._seek_sortlist) + self._srtlist = self._read_int16_count(self._nvar + 1)[:-1] + + self._path_or_buf.seek(self._seek_formats) + self._fmtlist = self._get_fmtlist() + + self._path_or_buf.seek(self._seek_value_label_names) + self._lbllist = self._get_lbllist() + + self._path_or_buf.seek(self._seek_variable_labels) + self._variable_labels = self._get_variable_labels() + + # Get data type information, works for versions 117-119. + def _get_dtypes( + self, seek_vartypes: int + ) -> tuple[list[int | str], list[str | np.dtype]]: + self._path_or_buf.seek(seek_vartypes) + typlist = [] + dtyplist = [] + for _ in range(self._nvar): + typ = self._read_uint16() + if typ <= 2045: + typlist.append(typ) + dtyplist.append(str(typ)) + else: + try: + typlist.append(self.TYPE_MAP_XML[typ]) # type: ignore[arg-type] + dtyplist.append(self.DTYPE_MAP_XML[typ]) # type: ignore[arg-type] + except KeyError as err: + raise ValueError(f"cannot convert stata types [{typ}]") from err + + return typlist, dtyplist # type: ignore[return-value] + + def _get_varlist(self) -> list[str]: + # 33 in order formats, 129 in formats 118 and 119 + b = 33 if self._format_version < 118 else 129 + return [self._decode(self._path_or_buf.read(b)) for _ in range(self._nvar)] + + # Returns the format list + def _get_fmtlist(self) -> list[str]: + if self._format_version >= 118: + b = 57 + elif self._format_version > 113: + b = 49 + elif self._format_version > 104: + b = 12 + else: + b = 7 + + return [self._decode(self._path_or_buf.read(b)) for _ in range(self._nvar)] + + # Returns the label list + def _get_lbllist(self) -> list[str]: + if self._format_version >= 118: + b = 129 + elif self._format_version > 108: + b = 33 + else: + b = 9 + return [self._decode(self._path_or_buf.read(b)) for _ in range(self._nvar)] + + def _get_variable_labels(self) -> list[str]: + if self._format_version >= 118: + vlblist = [ + self._decode(self._path_or_buf.read(321)) for _ in range(self._nvar) + ] + elif self._format_version > 105: + vlblist = [ + self._decode(self._path_or_buf.read(81)) for _ in range(self._nvar) + ] + else: + vlblist = [ + self._decode(self._path_or_buf.read(32)) for _ in range(self._nvar) + ] + return vlblist + + def _get_nobs(self) -> int: + if self._format_version >= 118: + return self._read_uint64() + else: + return self._read_uint32() + + def _get_data_label(self) -> str: + if self._format_version >= 118: + strlen = self._read_uint16() + return self._decode(self._path_or_buf.read(strlen)) + elif self._format_version == 117: + strlen = self._read_int8() + return self._decode(self._path_or_buf.read(strlen)) + elif self._format_version > 105: + return self._decode(self._path_or_buf.read(81)) + else: + return self._decode(self._path_or_buf.read(32)) + + def _get_time_stamp(self) -> str: + if self._format_version >= 118: + strlen = self._read_int8() + return self._path_or_buf.read(strlen).decode("utf-8") + elif self._format_version == 117: + strlen = self._read_int8() + return self._decode(self._path_or_buf.read(strlen)) + elif self._format_version > 104: + return self._decode(self._path_or_buf.read(18)) + else: + raise ValueError() + + def _get_seek_variable_labels(self) -> int: + if self._format_version == 117: + self._path_or_buf.read(8) # , throw away + # Stata 117 data files do not follow the described format. This is + # a work around that uses the previous label, 33 bytes for each + # variable, 20 for the closing tag and 17 for the opening tag + return self._seek_value_label_names + (33 * self._nvar) + 20 + 17 + elif self._format_version >= 118: + return self._read_int64() + 17 + else: + raise ValueError() + + def _read_old_header(self, first_char: bytes) -> None: + self._format_version = int(first_char[0]) + if self._format_version not in [104, 105, 108, 111, 113, 114, 115]: + raise ValueError(_version_error.format(version=self._format_version)) + self._set_encoding() + self._byteorder = ">" if self._read_int8() == 0x1 else "<" + self._filetype = self._read_int8() + self._path_or_buf.read(1) # unused + + self._nvar = self._read_uint16() + self._nobs = self._get_nobs() + + self._data_label = self._get_data_label() + + self._time_stamp = self._get_time_stamp() + + # descriptors + if self._format_version > 108: + typlist = [int(c) for c in self._path_or_buf.read(self._nvar)] + else: + buf = self._path_or_buf.read(self._nvar) + typlistb = np.frombuffer(buf, dtype=np.uint8) + typlist = [] + for tp in typlistb: + if tp in self.OLD_TYPE_MAPPING: + typlist.append(self.OLD_TYPE_MAPPING[tp]) + else: + typlist.append(tp - 127) # bytes + + try: + self._typlist = [self.TYPE_MAP[typ] for typ in typlist] + except ValueError as err: + invalid_types = ",".join([str(x) for x in typlist]) + raise ValueError(f"cannot convert stata types [{invalid_types}]") from err + try: + self._dtyplist = [self.DTYPE_MAP[typ] for typ in typlist] + except ValueError as err: + invalid_dtypes = ",".join([str(x) for x in typlist]) + raise ValueError(f"cannot convert stata dtypes [{invalid_dtypes}]") from err + + if self._format_version > 108: + self._varlist = [ + self._decode(self._path_or_buf.read(33)) for _ in range(self._nvar) + ] + else: + self._varlist = [ + self._decode(self._path_or_buf.read(9)) for _ in range(self._nvar) + ] + self._srtlist = self._read_int16_count(self._nvar + 1)[:-1] + + self._fmtlist = self._get_fmtlist() + + self._lbllist = self._get_lbllist() + + self._variable_labels = self._get_variable_labels() + + # ignore expansion fields (Format 105 and later) + # When reading, read five bytes; the last four bytes now tell you + # the size of the next read, which you discard. You then continue + # like this until you read 5 bytes of zeros. + + if self._format_version > 104: + while True: + data_type = self._read_int8() + if self._format_version > 108: + data_len = self._read_int32() + else: + data_len = self._read_int16() + if data_type == 0: + break + self._path_or_buf.read(data_len) + + # necessary data to continue parsing + self._data_location = self._path_or_buf.tell() + + def _setup_dtype(self) -> np.dtype: + """Map between numpy and state dtypes""" + if self._dtype is not None: + return self._dtype + + dtypes = [] # Convert struct data types to numpy data type + for i, typ in enumerate(self._typlist): + if typ in self.NUMPY_TYPE_MAP: + typ = cast(str, typ) # only strs in NUMPY_TYPE_MAP + dtypes.append((f"s{i}", f"{self._byteorder}{self.NUMPY_TYPE_MAP[typ]}")) + else: + dtypes.append((f"s{i}", f"S{typ}")) + self._dtype = np.dtype(dtypes) + + return self._dtype + + def _decode(self, s: bytes) -> str: + # have bytes not strings, so must decode + s = s.partition(b"\0")[0] + try: + return s.decode(self._encoding) + except UnicodeDecodeError: + # GH 25960, fallback to handle incorrect format produced when 117 + # files are converted to 118 files in Stata + encoding = self._encoding + msg = f""" +One or more strings in the dta file could not be decoded using {encoding}, and +so the fallback encoding of latin-1 is being used. This can happen when a file +has been incorrectly encoded by Stata or some other software. You should verify +the string values returned are correct.""" + warnings.warn( + msg, + UnicodeWarning, + stacklevel=find_stack_level(), + ) + return s.decode("latin-1") + + def _read_value_labels(self) -> None: + self._ensure_open() + if self._value_labels_read: + # Don't read twice + return + if self._format_version <= 108: + # Value labels are not supported in version 108 and earlier. + self._value_labels_read = True + self._value_label_dict: dict[str, dict[float, str]] = {} + return + + if self._format_version >= 117: + self._path_or_buf.seek(self._seek_value_labels) + else: + assert self._dtype is not None + offset = self._nobs * self._dtype.itemsize + self._path_or_buf.seek(self._data_location + offset) + + self._value_labels_read = True + self._value_label_dict = {} + + while True: + if self._format_version >= 117: + if self._path_or_buf.read(5) == b" + break # end of value label table + + slength = self._path_or_buf.read(4) + if not slength: + break # end of value label table (format < 117) + if self._format_version <= 117: + labname = self._decode(self._path_or_buf.read(33)) + else: + labname = self._decode(self._path_or_buf.read(129)) + self._path_or_buf.read(3) # padding + + n = self._read_uint32() + txtlen = self._read_uint32() + off = np.frombuffer( + self._path_or_buf.read(4 * n), dtype=f"{self._byteorder}i4", count=n + ) + val = np.frombuffer( + self._path_or_buf.read(4 * n), dtype=f"{self._byteorder}i4", count=n + ) + ii = np.argsort(off) + off = off[ii] + val = val[ii] + txt = self._path_or_buf.read(txtlen) + self._value_label_dict[labname] = {} + for i in range(n): + end = off[i + 1] if i < n - 1 else txtlen + self._value_label_dict[labname][val[i]] = self._decode( + txt[off[i] : end] + ) + if self._format_version >= 117: + self._path_or_buf.read(6) # + self._value_labels_read = True + + def _read_strls(self) -> None: + self._path_or_buf.seek(self._seek_strls) + # Wrap v_o in a string to allow uint64 values as keys on 32bit OS + self.GSO = {"0": ""} + while True: + if self._path_or_buf.read(3) != b"GSO": + break + + if self._format_version == 117: + v_o = self._read_uint64() + else: + buf = self._path_or_buf.read(12) + # Only tested on little endian file on little endian machine. + v_size = 2 if self._format_version == 118 else 3 + if self._byteorder == "<": + buf = buf[0:v_size] + buf[4 : (12 - v_size)] + else: + # This path may not be correct, impossible to test + buf = buf[0:v_size] + buf[(4 + v_size) :] + v_o = struct.unpack("Q", buf)[0] + typ = self._read_uint8() + length = self._read_uint32() + va = self._path_or_buf.read(length) + if typ == 130: + decoded_va = va[0:-1].decode(self._encoding) + else: + # Stata says typ 129 can be binary, so use str + decoded_va = str(va) + # Wrap v_o in a string to allow uint64 values as keys on 32bit OS + self.GSO[str(v_o)] = decoded_va + + def __next__(self) -> DataFrame: + self._using_iterator = True + return self.read(nrows=self._chunksize) + + def get_chunk(self, size: int | None = None) -> DataFrame: + """ + Reads lines from Stata file and returns as dataframe + + Parameters + ---------- + size : int, defaults to None + Number of lines to read. If None, reads whole file. + + Returns + ------- + DataFrame + """ + if size is None: + size = self._chunksize + return self.read(nrows=size) + + @Appender(_read_method_doc) + def read( + self, + nrows: int | None = None, + convert_dates: bool | None = None, + convert_categoricals: bool | None = None, + index_col: str | None = None, + convert_missing: bool | None = None, + preserve_dtypes: bool | None = None, + columns: Sequence[str] | None = None, + order_categoricals: bool | None = None, + ) -> DataFrame: + self._ensure_open() + + # Handle options + if convert_dates is None: + convert_dates = self._convert_dates + if convert_categoricals is None: + convert_categoricals = self._convert_categoricals + if convert_missing is None: + convert_missing = self._convert_missing + if preserve_dtypes is None: + preserve_dtypes = self._preserve_dtypes + if columns is None: + columns = self._columns + if order_categoricals is None: + order_categoricals = self._order_categoricals + if index_col is None: + index_col = self._index_col + if nrows is None: + nrows = self._nobs + + # Handle empty file or chunk. If reading incrementally raise + # StopIteration. If reading the whole thing return an empty + # data frame. + if (self._nobs == 0) and nrows == 0: + self._can_read_value_labels = True + self._data_read = True + data = DataFrame(columns=self._varlist) + # Apply dtypes correctly + for i, col in enumerate(data.columns): + dt = self._dtyplist[i] + if isinstance(dt, np.dtype): + if dt.char != "S": + data[col] = data[col].astype(dt) + if columns is not None: + data = self._do_select_columns(data, columns) + return data + + if (self._format_version >= 117) and (not self._value_labels_read): + self._can_read_value_labels = True + self._read_strls() + + # Read data + assert self._dtype is not None + dtype = self._dtype + max_read_len = (self._nobs - self._lines_read) * dtype.itemsize + read_len = nrows * dtype.itemsize + read_len = min(read_len, max_read_len) + if read_len <= 0: + # Iterator has finished, should never be here unless + # we are reading the file incrementally + if convert_categoricals: + self._read_value_labels() + raise StopIteration + offset = self._lines_read * dtype.itemsize + self._path_or_buf.seek(self._data_location + offset) + read_lines = min(nrows, self._nobs - self._lines_read) + raw_data = np.frombuffer( + self._path_or_buf.read(read_len), dtype=dtype, count=read_lines + ) + + self._lines_read += read_lines + if self._lines_read == self._nobs: + self._can_read_value_labels = True + self._data_read = True + # if necessary, swap the byte order to native here + if self._byteorder != self._native_byteorder: + raw_data = raw_data.byteswap().view(raw_data.dtype.newbyteorder()) + + if convert_categoricals: + self._read_value_labels() + + if len(raw_data) == 0: + data = DataFrame(columns=self._varlist) + else: + data = DataFrame.from_records(raw_data) + data.columns = Index(self._varlist) + + # If index is not specified, use actual row number rather than + # restarting at 0 for each chunk. + if index_col is None: + data.index = RangeIndex( + self._lines_read - read_lines, self._lines_read + ) # set attr instead of set_index to avoid copy + + if columns is not None: + data = self._do_select_columns(data, columns) + + # Decode strings + for col, typ in zip(data, self._typlist): + if isinstance(typ, int): + data[col] = data[col].apply(self._decode) + + data = self._insert_strls(data) + + # Convert columns (if needed) to match input type + valid_dtypes = [i for i, dtyp in enumerate(self._dtyplist) if dtyp is not None] + object_type = np.dtype(object) + for idx in valid_dtypes: + dtype = data.iloc[:, idx].dtype + if dtype not in (object_type, self._dtyplist[idx]): + data.isetitem(idx, data.iloc[:, idx].astype(dtype)) + + data = self._do_convert_missing(data, convert_missing) + + if convert_dates: + for i, fmt in enumerate(self._fmtlist): + if any(fmt.startswith(date_fmt) for date_fmt in _date_formats): + data.isetitem( + i, _stata_elapsed_date_to_datetime_vec(data.iloc[:, i], fmt) + ) + + if convert_categoricals and self._format_version > 108: + data = self._do_convert_categoricals( + data, self._value_label_dict, self._lbllist, order_categoricals + ) + + if not preserve_dtypes: + retyped_data = [] + convert = False + for col in data: + dtype = data[col].dtype + if dtype in (np.dtype(np.float16), np.dtype(np.float32)): + dtype = np.dtype(np.float64) + convert = True + elif dtype in ( + np.dtype(np.int8), + np.dtype(np.int16), + np.dtype(np.int32), + ): + dtype = np.dtype(np.int64) + convert = True + retyped_data.append((col, data[col].astype(dtype))) + if convert: + data = DataFrame.from_dict(dict(retyped_data)) + + if index_col is not None: + data = data.set_index(data.pop(index_col)) + + return data + + def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFrame: + # Check for missing values, and replace if found + replacements = {} + for i in range(len(data.columns)): + fmt = self._typlist[i] + if fmt not in self.VALID_RANGE: + continue + + fmt = cast(str, fmt) # only strs in VALID_RANGE + nmin, nmax = self.VALID_RANGE[fmt] + series = data.iloc[:, i] + + # appreciably faster to do this with ndarray instead of Series + svals = series._values + missing = (svals < nmin) | (svals > nmax) + + if not missing.any(): + continue + + if convert_missing: # Replacement follows Stata notation + missing_loc = np.nonzero(np.asarray(missing))[0] + umissing, umissing_loc = np.unique(series[missing], return_inverse=True) + replacement = Series(series, dtype=object) + for j, um in enumerate(umissing): + missing_value = StataMissingValue(um) + + loc = missing_loc[umissing_loc == j] + replacement.iloc[loc] = missing_value + else: # All replacements are identical + dtype = series.dtype + if dtype not in (np.float32, np.float64): + dtype = np.float64 + replacement = Series(series, dtype=dtype) + if not replacement._values.flags["WRITEABLE"]: + # only relevant for ArrayManager; construction + # path for BlockManager ensures writeability + replacement = replacement.copy() + # Note: operating on ._values is much faster than directly + # TODO: can we fix that? + replacement._values[missing] = np.nan + replacements[i] = replacement + if replacements: + for idx, value in replacements.items(): + data.isetitem(idx, value) + return data + + def _insert_strls(self, data: DataFrame) -> DataFrame: + if not hasattr(self, "GSO") or len(self.GSO) == 0: + return data + for i, typ in enumerate(self._typlist): + if typ != "Q": + continue + # Wrap v_o in a string to allow uint64 values as keys on 32bit OS + data.isetitem(i, [self.GSO[str(k)] for k in data.iloc[:, i]]) + return data + + def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFrame: + if not self._column_selector_set: + column_set = set(columns) + if len(column_set) != len(columns): + raise ValueError("columns contains duplicate entries") + unmatched = column_set.difference(data.columns) + if unmatched: + joined = ", ".join(list(unmatched)) + raise ValueError( + "The following columns were not " + f"found in the Stata data set: {joined}" + ) + # Copy information for retained columns for later processing + dtyplist = [] + typlist = [] + fmtlist = [] + lbllist = [] + for col in columns: + i = data.columns.get_loc(col) + dtyplist.append(self._dtyplist[i]) + typlist.append(self._typlist[i]) + fmtlist.append(self._fmtlist[i]) + lbllist.append(self._lbllist[i]) + + self._dtyplist = dtyplist + self._typlist = typlist + self._fmtlist = fmtlist + self._lbllist = lbllist + self._column_selector_set = True + + return data[columns] + + def _do_convert_categoricals( + self, + data: DataFrame, + value_label_dict: dict[str, dict[float, str]], + lbllist: Sequence[str], + order_categoricals: bool, + ) -> DataFrame: + """ + Converts categorical columns to Categorical type. + """ + if not value_label_dict: + return data + cat_converted_data = [] + for col, label in zip(data, lbllist): + if label in value_label_dict: + # Explicit call with ordered=True + vl = value_label_dict[label] + keys = np.array(list(vl.keys())) + column = data[col] + key_matches = column.isin(keys) + if self._using_iterator and key_matches.all(): + initial_categories: np.ndarray | None = keys + # If all categories are in the keys and we are iterating, + # use the same keys for all chunks. If some are missing + # value labels, then we will fall back to the categories + # varying across chunks. + else: + if self._using_iterator: + # warn is using an iterator + warnings.warn( + categorical_conversion_warning, + CategoricalConversionWarning, + stacklevel=find_stack_level(), + ) + initial_categories = None + cat_data = Categorical( + column, categories=initial_categories, ordered=order_categoricals + ) + if initial_categories is None: + # If None here, then we need to match the cats in the Categorical + categories = [] + for category in cat_data.categories: + if category in vl: + categories.append(vl[category]) + else: + categories.append(category) + else: + # If all cats are matched, we can use the values + categories = list(vl.values()) + try: + # Try to catch duplicate categories + # TODO: if we get a non-copying rename_categories, use that + cat_data = cat_data.rename_categories(categories) + except ValueError as err: + vc = Series(categories, copy=False).value_counts() + repeated_cats = list(vc.index[vc > 1]) + repeats = "-" * 80 + "\n" + "\n".join(repeated_cats) + # GH 25772 + msg = f""" +Value labels for column {col} are not unique. These cannot be converted to +pandas categoricals. + +Either read the file with `convert_categoricals` set to False or use the +low level interface in `StataReader` to separately read the values and the +value_labels. + +The repeated labels are: +{repeats} +""" + raise ValueError(msg) from err + # TODO: is the next line needed above in the data(...) method? + cat_series = Series(cat_data, index=data.index, copy=False) + cat_converted_data.append((col, cat_series)) + else: + cat_converted_data.append((col, data[col])) + data = DataFrame(dict(cat_converted_data), copy=False) + return data + + @property + def data_label(self) -> str: + """ + Return data label of Stata file. + + Examples + -------- + >>> df = pd.DataFrame([(1,)], columns=["variable"]) + >>> time_stamp = pd.Timestamp(2000, 2, 29, 14, 21) + >>> data_label = "This is a data file." + >>> path = "/My_path/filename.dta" + >>> df.to_stata(path, time_stamp=time_stamp, # doctest: +SKIP + ... data_label=data_label, # doctest: +SKIP + ... version=None) # doctest: +SKIP + >>> with pd.io.stata.StataReader(path) as reader: # doctest: +SKIP + ... print(reader.data_label) # doctest: +SKIP + This is a data file. + """ + self._ensure_open() + return self._data_label + + @property + def time_stamp(self) -> str: + """ + Return time stamp of Stata file. + """ + self._ensure_open() + return self._time_stamp + + def variable_labels(self) -> dict[str, str]: + """ + Return a dict associating each variable name with corresponding label. + + Returns + ------- + dict + + Examples + -------- + >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["col_1", "col_2"]) + >>> time_stamp = pd.Timestamp(2000, 2, 29, 14, 21) + >>> path = "/My_path/filename.dta" + >>> variable_labels = {"col_1": "This is an example"} + >>> df.to_stata(path, time_stamp=time_stamp, # doctest: +SKIP + ... variable_labels=variable_labels, version=None) # doctest: +SKIP + >>> with pd.io.stata.StataReader(path) as reader: # doctest: +SKIP + ... print(reader.variable_labels()) # doctest: +SKIP + {'index': '', 'col_1': 'This is an example', 'col_2': ''} + >>> pd.read_stata(path) # doctest: +SKIP + index col_1 col_2 + 0 0 1 2 + 1 1 3 4 + """ + self._ensure_open() + return dict(zip(self._varlist, self._variable_labels)) + + def value_labels(self) -> dict[str, dict[float, str]]: + """ + Return a nested dict associating each variable name to its value and label. + + Returns + ------- + dict + + Examples + -------- + >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["col_1", "col_2"]) + >>> time_stamp = pd.Timestamp(2000, 2, 29, 14, 21) + >>> path = "/My_path/filename.dta" + >>> value_labels = {"col_1": {3: "x"}} + >>> df.to_stata(path, time_stamp=time_stamp, # doctest: +SKIP + ... value_labels=value_labels, version=None) # doctest: +SKIP + >>> with pd.io.stata.StataReader(path) as reader: # doctest: +SKIP + ... print(reader.value_labels()) # doctest: +SKIP + {'col_1': {3: 'x'}} + >>> pd.read_stata(path) # doctest: +SKIP + index col_1 col_2 + 0 0 1 2 + 1 1 x 4 + """ + if not self._value_labels_read: + self._read_value_labels() + + return self._value_label_dict + + +@Appender(_read_stata_doc) +def read_stata( + filepath_or_buffer: FilePath | ReadBuffer[bytes], + *, + convert_dates: bool = True, + convert_categoricals: bool = True, + index_col: str | None = None, + convert_missing: bool = False, + preserve_dtypes: bool = True, + columns: Sequence[str] | None = None, + order_categoricals: bool = True, + chunksize: int | None = None, + iterator: bool = False, + compression: CompressionOptions = "infer", + storage_options: StorageOptions | None = None, +) -> DataFrame | StataReader: + reader = StataReader( + filepath_or_buffer, + convert_dates=convert_dates, + convert_categoricals=convert_categoricals, + index_col=index_col, + convert_missing=convert_missing, + preserve_dtypes=preserve_dtypes, + columns=columns, + order_categoricals=order_categoricals, + chunksize=chunksize, + storage_options=storage_options, + compression=compression, + ) + + if iterator or chunksize: + return reader + + with reader: + return reader.read() + + +def _set_endianness(endianness: str) -> str: + if endianness.lower() in ["<", "little"]: + return "<" + elif endianness.lower() in [">", "big"]: + return ">" + else: # pragma : no cover + raise ValueError(f"Endianness {endianness} not understood") + + +def _pad_bytes(name: AnyStr, length: int) -> AnyStr: + """ + Take a char string and pads it with null bytes until it's length chars. + """ + if isinstance(name, bytes): + return name + b"\x00" * (length - len(name)) + return name + "\x00" * (length - len(name)) + + +def _convert_datetime_to_stata_type(fmt: str) -> np.dtype: + """ + Convert from one of the stata date formats to a type in TYPE_MAP. + """ + if fmt in [ + "tc", + "%tc", + "td", + "%td", + "tw", + "%tw", + "tm", + "%tm", + "tq", + "%tq", + "th", + "%th", + "ty", + "%ty", + ]: + return np.dtype(np.float64) # Stata expects doubles for SIFs + else: + raise NotImplementedError(f"Format {fmt} not implemented") + + +def _maybe_convert_to_int_keys(convert_dates: dict, varlist: list[Hashable]) -> dict: + new_dict = {} + for key in convert_dates: + if not convert_dates[key].startswith("%"): # make sure proper fmts + convert_dates[key] = "%" + convert_dates[key] + if key in varlist: + new_dict.update({varlist.index(key): convert_dates[key]}) + else: + if not isinstance(key, int): + raise ValueError("convert_dates key must be a column or an integer") + new_dict.update({key: convert_dates[key]}) + return new_dict + + +def _dtype_to_stata_type(dtype: np.dtype, column: Series) -> int: + """ + Convert dtype types to stata types. Returns the byte of the given ordinal. + See TYPE_MAP and comments for an explanation. This is also explained in + the dta spec. + 1 - 244 are strings of this length + Pandas Stata + 251 - for int8 byte + 252 - for int16 int + 253 - for int32 long + 254 - for float32 float + 255 - for double double + + If there are dates to convert, then dtype will already have the correct + type inserted. + """ + # TODO: expand to handle datetime to integer conversion + if dtype.type is np.object_: # try to coerce it to the biggest string + # not memory efficient, what else could we + # do? + itemsize = max_len_string_array(ensure_object(column._values)) + return max(itemsize, 1) + elif dtype.type is np.float64: + return 255 + elif dtype.type is np.float32: + return 254 + elif dtype.type is np.int32: + return 253 + elif dtype.type is np.int16: + return 252 + elif dtype.type is np.int8: + return 251 + else: # pragma : no cover + raise NotImplementedError(f"Data type {dtype} not supported.") + + +def _dtype_to_default_stata_fmt( + dtype, column: Series, dta_version: int = 114, force_strl: bool = False +) -> str: + """ + Map numpy dtype to stata's default format for this type. Not terribly + important since users can change this in Stata. Semantics are + + object -> "%DDs" where DD is the length of the string. If not a string, + raise ValueError + float64 -> "%10.0g" + float32 -> "%9.0g" + int64 -> "%9.0g" + int32 -> "%12.0g" + int16 -> "%8.0g" + int8 -> "%8.0g" + strl -> "%9s" + """ + # TODO: Refactor to combine type with format + # TODO: expand this to handle a default datetime format? + if dta_version < 117: + max_str_len = 244 + else: + max_str_len = 2045 + if force_strl: + return "%9s" + if dtype.type is np.object_: + itemsize = max_len_string_array(ensure_object(column._values)) + if itemsize > max_str_len: + if dta_version >= 117: + return "%9s" + else: + raise ValueError(excessive_string_length_error.format(column.name)) + return "%" + str(max(itemsize, 1)) + "s" + elif dtype == np.float64: + return "%10.0g" + elif dtype == np.float32: + return "%9.0g" + elif dtype == np.int32: + return "%12.0g" + elif dtype in (np.int8, np.int16): + return "%8.0g" + else: # pragma : no cover + raise NotImplementedError(f"Data type {dtype} not supported.") + + +@doc( + storage_options=_shared_docs["storage_options"], + compression_options=_shared_docs["compression_options"] % "fname", +) +class StataWriter(StataParser): + """ + A class for writing Stata binary dta files + + Parameters + ---------- + fname : path (string), buffer or path object + string, path object (pathlib.Path or py._path.local.LocalPath) or + object implementing a binary write() functions. If using a buffer + then the buffer will not be automatically closed after the file + is written. + data : DataFrame + Input to save + convert_dates : dict + Dictionary mapping columns containing datetime types to stata internal + format to use when writing the dates. Options are 'tc', 'td', 'tm', + 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. + Datetime columns that do not have a conversion type specified will be + converted to 'tc'. Raises NotImplementedError if a datetime column has + timezone information + write_index : bool + Write the index to Stata dataset. + byteorder : str + Can be ">", "<", "little", or "big". default is `sys.byteorder` + time_stamp : datetime + A datetime to use as file creation date. Default is the current time + data_label : str + A label for the data set. Must be 80 characters or smaller. + variable_labels : dict + Dictionary containing columns as keys and variable labels as values. + Each label must be 80 characters or smaller. + {compression_options} + + .. versionchanged:: 1.4.0 Zstandard support. + + {storage_options} + + value_labels : dict of dicts + Dictionary containing columns as keys and dictionaries of column value + to labels as values. The combined length of all labels for a single + variable must be 32,000 characters or smaller. + + .. versionadded:: 1.4.0 + + Returns + ------- + writer : StataWriter instance + The StataWriter instance has a write_file method, which will + write the file to the given `fname`. + + Raises + ------ + NotImplementedError + * If datetimes contain timezone information + ValueError + * Columns listed in convert_dates are neither datetime64[ns] + or datetime + * Column dtype is not representable in Stata + * Column listed in convert_dates is not in DataFrame + * Categorical label contains more than 32,000 characters + + Examples + -------- + >>> data = pd.DataFrame([[1.0, 1]], columns=['a', 'b']) + >>> writer = StataWriter('./data_file.dta', data) + >>> writer.write_file() + + Directly write a zip file + >>> compression = {{"method": "zip", "archive_name": "data_file.dta"}} + >>> writer = StataWriter('./data_file.zip', data, compression=compression) + >>> writer.write_file() + + Save a DataFrame with dates + >>> from datetime import datetime + >>> data = pd.DataFrame([[datetime(2000,1,1)]], columns=['date']) + >>> writer = StataWriter('./date_data_file.dta', data, {{'date' : 'tw'}}) + >>> writer.write_file() + """ + + _max_string_length = 244 + _encoding: Literal["latin-1", "utf-8"] = "latin-1" + + def __init__( + self, + fname: FilePath | WriteBuffer[bytes], + data: DataFrame, + convert_dates: dict[Hashable, str] | None = None, + write_index: bool = True, + byteorder: str | None = None, + time_stamp: datetime | None = None, + data_label: str | None = None, + variable_labels: dict[Hashable, str] | None = None, + compression: CompressionOptions = "infer", + storage_options: StorageOptions | None = None, + *, + value_labels: dict[Hashable, dict[float, str]] | None = None, + ) -> None: + super().__init__() + self.data = data + self._convert_dates = {} if convert_dates is None else convert_dates + self._write_index = write_index + self._time_stamp = time_stamp + self._data_label = data_label + self._variable_labels = variable_labels + self._non_cat_value_labels = value_labels + self._value_labels: list[StataValueLabel] = [] + self._has_value_labels = np.array([], dtype=bool) + self._compression = compression + self._output_file: IO[bytes] | None = None + self._converted_names: dict[Hashable, str] = {} + # attach nobs, nvars, data, varlist, typlist + self._prepare_pandas(data) + self.storage_options = storage_options + + if byteorder is None: + byteorder = sys.byteorder + self._byteorder = _set_endianness(byteorder) + self._fname = fname + self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8} + + def _write(self, to_write: str) -> None: + """ + Helper to call encode before writing to file for Python 3 compat. + """ + self.handles.handle.write(to_write.encode(self._encoding)) + + def _write_bytes(self, value: bytes) -> None: + """ + Helper to assert file is open before writing. + """ + self.handles.handle.write(value) + + def _prepare_non_cat_value_labels( + self, data: DataFrame + ) -> list[StataNonCatValueLabel]: + """ + Check for value labels provided for non-categorical columns. Value + labels + """ + non_cat_value_labels: list[StataNonCatValueLabel] = [] + if self._non_cat_value_labels is None: + return non_cat_value_labels + + for labname, labels in self._non_cat_value_labels.items(): + if labname in self._converted_names: + colname = self._converted_names[labname] + elif labname in data.columns: + colname = str(labname) + else: + raise KeyError( + f"Can't create value labels for {labname}, it wasn't " + "found in the dataset." + ) + + if not is_numeric_dtype(data[colname].dtype): + # Labels should not be passed explicitly for categorical + # columns that will be converted to int + raise ValueError( + f"Can't create value labels for {labname}, value labels " + "can only be applied to numeric columns." + ) + svl = StataNonCatValueLabel(colname, labels, self._encoding) + non_cat_value_labels.append(svl) + return non_cat_value_labels + + def _prepare_categoricals(self, data: DataFrame) -> DataFrame: + """ + Check for categorical columns, retain categorical information for + Stata file and convert categorical data to int + """ + is_cat = [isinstance(dtype, CategoricalDtype) for dtype in data.dtypes] + if not any(is_cat): + return data + + self._has_value_labels |= np.array(is_cat) + + get_base_missing_value = StataMissingValue.get_base_missing_value + data_formatted = [] + for col, col_is_cat in zip(data, is_cat): + if col_is_cat: + svl = StataValueLabel(data[col], encoding=self._encoding) + self._value_labels.append(svl) + dtype = data[col].cat.codes.dtype + if dtype == np.int64: + raise ValueError( + "It is not possible to export " + "int64-based categorical data to Stata." + ) + values = data[col].cat.codes._values.copy() + + # Upcast if needed so that correct missing values can be set + if values.max() >= get_base_missing_value(dtype): + if dtype == np.int8: + dtype = np.dtype(np.int16) + elif dtype == np.int16: + dtype = np.dtype(np.int32) + else: + dtype = np.dtype(np.float64) + values = np.array(values, dtype=dtype) + + # Replace missing values with Stata missing value for type + values[values == -1] = get_base_missing_value(dtype) + data_formatted.append((col, values)) + else: + data_formatted.append((col, data[col])) + return DataFrame.from_dict(dict(data_formatted)) + + def _replace_nans(self, data: DataFrame) -> DataFrame: + # return data + """ + Checks floating point data columns for nans, and replaces these with + the generic Stata for missing value (.) + """ + for c in data: + dtype = data[c].dtype + if dtype in (np.float32, np.float64): + if dtype == np.float32: + replacement = self.MISSING_VALUES["f"] + else: + replacement = self.MISSING_VALUES["d"] + data[c] = data[c].fillna(replacement) + + return data + + def _update_strl_names(self) -> None: + """No-op, forward compatibility""" + + def _validate_variable_name(self, name: str) -> str: + """ + Validate variable names for Stata export. + + Parameters + ---------- + name : str + Variable name + + Returns + ------- + str + The validated name with invalid characters replaced with + underscores. + + Notes + ----- + Stata 114 and 117 support ascii characters in a-z, A-Z, 0-9 + and _. + """ + for c in name: + if ( + (c < "A" or c > "Z") + and (c < "a" or c > "z") + and (c < "0" or c > "9") + and c != "_" + ): + name = name.replace(c, "_") + return name + + def _check_column_names(self, data: DataFrame) -> DataFrame: + """ + Checks column names to ensure that they are valid Stata column names. + This includes checks for: + * Non-string names + * Stata keywords + * Variables that start with numbers + * Variables with names that are too long + + When an illegal variable name is detected, it is converted, and if + dates are exported, the variable name is propagated to the date + conversion dictionary + """ + converted_names: dict[Hashable, str] = {} + columns = list(data.columns) + original_columns = columns[:] + + duplicate_var_id = 0 + for j, name in enumerate(columns): + orig_name = name + if not isinstance(name, str): + name = str(name) + + name = self._validate_variable_name(name) + + # Variable name must not be a reserved word + if name in self.RESERVED_WORDS: + name = "_" + name + + # Variable name may not start with a number + if "0" <= name[0] <= "9": + name = "_" + name + + name = name[: min(len(name), 32)] + + if not name == orig_name: + # check for duplicates + while columns.count(name) > 0: + # prepend ascending number to avoid duplicates + name = "_" + str(duplicate_var_id) + name + name = name[: min(len(name), 32)] + duplicate_var_id += 1 + converted_names[orig_name] = name + + columns[j] = name + + data.columns = Index(columns) + + # Check date conversion, and fix key if needed + if self._convert_dates: + for c, o in zip(columns, original_columns): + if c != o: + self._convert_dates[c] = self._convert_dates[o] + del self._convert_dates[o] + + if converted_names: + conversion_warning = [] + for orig_name, name in converted_names.items(): + msg = f"{orig_name} -> {name}" + conversion_warning.append(msg) + + ws = invalid_name_doc.format("\n ".join(conversion_warning)) + warnings.warn( + ws, + InvalidColumnName, + stacklevel=find_stack_level(), + ) + + self._converted_names = converted_names + self._update_strl_names() + + return data + + def _set_formats_and_types(self, dtypes: Series) -> None: + self.fmtlist: list[str] = [] + self.typlist: list[int] = [] + for col, dtype in dtypes.items(): + self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, self.data[col])) + self.typlist.append(_dtype_to_stata_type(dtype, self.data[col])) + + def _prepare_pandas(self, data: DataFrame) -> None: + # NOTE: we might need a different API / class for pandas objects so + # we can set different semantics - handle this with a PR to pandas.io + + data = data.copy() + + if self._write_index: + temp = data.reset_index() + if isinstance(temp, DataFrame): + data = temp + + # Ensure column names are strings + data = self._check_column_names(data) + + # Check columns for compatibility with stata, upcast if necessary + # Raise if outside the supported range + data = _cast_to_stata_types(data) + + # Replace NaNs with Stata missing values + data = self._replace_nans(data) + + # Set all columns to initially unlabelled + self._has_value_labels = np.repeat(False, data.shape[1]) + + # Create value labels for non-categorical data + non_cat_value_labels = self._prepare_non_cat_value_labels(data) + + non_cat_columns = [svl.labname for svl in non_cat_value_labels] + has_non_cat_val_labels = data.columns.isin(non_cat_columns) + self._has_value_labels |= has_non_cat_val_labels + self._value_labels.extend(non_cat_value_labels) + + # Convert categoricals to int data, and strip labels + data = self._prepare_categoricals(data) + + self.nobs, self.nvar = data.shape + self.data = data + self.varlist = data.columns.tolist() + + dtypes = data.dtypes + + # Ensure all date columns are converted + for col in data: + if col in self._convert_dates: + continue + if lib.is_np_dtype(data[col].dtype, "M"): + self._convert_dates[col] = "tc" + + self._convert_dates = _maybe_convert_to_int_keys( + self._convert_dates, self.varlist + ) + for key in self._convert_dates: + new_type = _convert_datetime_to_stata_type(self._convert_dates[key]) + dtypes.iloc[key] = np.dtype(new_type) + + # Verify object arrays are strings and encode to bytes + self._encode_strings() + + self._set_formats_and_types(dtypes) + + # set the given format for the datetime cols + if self._convert_dates is not None: + for key in self._convert_dates: + if isinstance(key, int): + self.fmtlist[key] = self._convert_dates[key] + + def _encode_strings(self) -> None: + """ + Encode strings in dta-specific encoding + + Do not encode columns marked for date conversion or for strL + conversion. The strL converter independently handles conversion and + also accepts empty string arrays. + """ + convert_dates = self._convert_dates + # _convert_strl is not available in dta 114 + convert_strl = getattr(self, "_convert_strl", []) + for i, col in enumerate(self.data): + # Skip columns marked for date conversion or strl conversion + if i in convert_dates or col in convert_strl: + continue + column = self.data[col] + dtype = column.dtype + if dtype.type is np.object_: + inferred_dtype = infer_dtype(column, skipna=True) + if not ((inferred_dtype == "string") or len(column) == 0): + col = column.name + raise ValueError( + f"""\ +Column `{col}` cannot be exported.\n\nOnly string-like object arrays +containing all strings or a mix of strings and None can be exported. +Object arrays containing only null values are prohibited. Other object +types cannot be exported and must first be converted to one of the +supported types.""" + ) + encoded = self.data[col].str.encode(self._encoding) + # If larger than _max_string_length do nothing + if ( + max_len_string_array(ensure_object(encoded._values)) + <= self._max_string_length + ): + self.data[col] = encoded + + def write_file(self) -> None: + """ + Export DataFrame object to Stata dta format. + + Examples + -------- + >>> df = pd.DataFrame({"fully_labelled": [1, 2, 3, 3, 1], + ... "partially_labelled": [1.0, 2.0, np.nan, 9.0, np.nan], + ... "Y": [7, 7, 9, 8, 10], + ... "Z": pd.Categorical(["j", "k", "l", "k", "j"]), + ... }) + >>> path = "/My_path/filename.dta" + >>> labels = {"fully_labelled": {1: "one", 2: "two", 3: "three"}, + ... "partially_labelled": {1.0: "one", 2.0: "two"}, + ... } + >>> writer = pd.io.stata.StataWriter(path, + ... df, + ... value_labels=labels) # doctest: +SKIP + >>> writer.write_file() # doctest: +SKIP + >>> df = pd.read_stata(path) # doctest: +SKIP + >>> df # doctest: +SKIP + index fully_labelled partially_labeled Y Z + 0 0 one one 7 j + 1 1 two two 7 k + 2 2 three NaN 9 l + 3 3 three 9.0 8 k + 4 4 one NaN 10 j + """ + with get_handle( + self._fname, + "wb", + compression=self._compression, + is_text=False, + storage_options=self.storage_options, + ) as self.handles: + if self.handles.compression["method"] is not None: + # ZipFile creates a file (with the same name) for each write call. + # Write it first into a buffer and then write the buffer to the ZipFile. + self._output_file, self.handles.handle = self.handles.handle, BytesIO() + self.handles.created_handles.append(self.handles.handle) + + try: + self._write_header( + data_label=self._data_label, time_stamp=self._time_stamp + ) + self._write_map() + self._write_variable_types() + self._write_varnames() + self._write_sortlist() + self._write_formats() + self._write_value_label_names() + self._write_variable_labels() + self._write_expansion_fields() + self._write_characteristics() + records = self._prepare_data() + self._write_data(records) + self._write_strls() + self._write_value_labels() + self._write_file_close_tag() + self._write_map() + self._close() + except Exception as exc: + self.handles.close() + if isinstance(self._fname, (str, os.PathLike)) and os.path.isfile( + self._fname + ): + try: + os.unlink(self._fname) + except OSError: + warnings.warn( + f"This save was not successful but {self._fname} could not " + "be deleted. This file is not valid.", + ResourceWarning, + stacklevel=find_stack_level(), + ) + raise exc + + def _close(self) -> None: + """ + Close the file if it was created by the writer. + + If a buffer or file-like object was passed in, for example a GzipFile, + then leave this file open for the caller to close. + """ + # write compression + if self._output_file is not None: + assert isinstance(self.handles.handle, BytesIO) + bio, self.handles.handle = self.handles.handle, self._output_file + self.handles.handle.write(bio.getvalue()) + + def _write_map(self) -> None: + """No-op, future compatibility""" + + def _write_file_close_tag(self) -> None: + """No-op, future compatibility""" + + def _write_characteristics(self) -> None: + """No-op, future compatibility""" + + def _write_strls(self) -> None: + """No-op, future compatibility""" + + def _write_expansion_fields(self) -> None: + """Write 5 zeros for expansion fields""" + self._write(_pad_bytes("", 5)) + + def _write_value_labels(self) -> None: + for vl in self._value_labels: + self._write_bytes(vl.generate_value_label(self._byteorder)) + + def _write_header( + self, + data_label: str | None = None, + time_stamp: datetime | None = None, + ) -> None: + byteorder = self._byteorder + # ds_format - just use 114 + self._write_bytes(struct.pack("b", 114)) + # byteorder + self._write(byteorder == ">" and "\x01" or "\x02") + # filetype + self._write("\x01") + # unused + self._write("\x00") + # number of vars, 2 bytes + self._write_bytes(struct.pack(byteorder + "h", self.nvar)[:2]) + # number of obs, 4 bytes + self._write_bytes(struct.pack(byteorder + "i", self.nobs)[:4]) + # data label 81 bytes, char, null terminated + if data_label is None: + self._write_bytes(self._null_terminate_bytes(_pad_bytes("", 80))) + else: + self._write_bytes( + self._null_terminate_bytes(_pad_bytes(data_label[:80], 80)) + ) + # time stamp, 18 bytes, char, null terminated + # format dd Mon yyyy hh:mm + if time_stamp is None: + time_stamp = datetime.now() + elif not isinstance(time_stamp, datetime): + raise ValueError("time_stamp should be datetime type") + # GH #13856 + # Avoid locale-specific month conversion + months = [ + "Jan", + "Feb", + "Mar", + "Apr", + "May", + "Jun", + "Jul", + "Aug", + "Sep", + "Oct", + "Nov", + "Dec", + ] + month_lookup = {i + 1: month for i, month in enumerate(months)} + ts = ( + time_stamp.strftime("%d ") + + month_lookup[time_stamp.month] + + time_stamp.strftime(" %Y %H:%M") + ) + self._write_bytes(self._null_terminate_bytes(ts)) + + def _write_variable_types(self) -> None: + for typ in self.typlist: + self._write_bytes(struct.pack("B", typ)) + + def _write_varnames(self) -> None: + # varlist names are checked by _check_column_names + # varlist, requires null terminated + for name in self.varlist: + name = self._null_terminate_str(name) + name = _pad_bytes(name[:32], 33) + self._write(name) + + def _write_sortlist(self) -> None: + # srtlist, 2*(nvar+1), int array, encoded by byteorder + srtlist = _pad_bytes("", 2 * (self.nvar + 1)) + self._write(srtlist) + + def _write_formats(self) -> None: + # fmtlist, 49*nvar, char array + for fmt in self.fmtlist: + self._write(_pad_bytes(fmt, 49)) + + def _write_value_label_names(self) -> None: + # lbllist, 33*nvar, char array + for i in range(self.nvar): + # Use variable name when categorical + if self._has_value_labels[i]: + name = self.varlist[i] + name = self._null_terminate_str(name) + name = _pad_bytes(name[:32], 33) + self._write(name) + else: # Default is empty label + self._write(_pad_bytes("", 33)) + + def _write_variable_labels(self) -> None: + # Missing labels are 80 blank characters plus null termination + blank = _pad_bytes("", 81) + + if self._variable_labels is None: + for i in range(self.nvar): + self._write(blank) + return + + for col in self.data: + if col in self._variable_labels: + label = self._variable_labels[col] + if len(label) > 80: + raise ValueError("Variable labels must be 80 characters or fewer") + is_latin1 = all(ord(c) < 256 for c in label) + if not is_latin1: + raise ValueError( + "Variable labels must contain only characters that " + "can be encoded in Latin-1" + ) + self._write(_pad_bytes(label, 81)) + else: + self._write(blank) + + def _convert_strls(self, data: DataFrame) -> DataFrame: + """No-op, future compatibility""" + return data + + def _prepare_data(self) -> np.rec.recarray: + data = self.data + typlist = self.typlist + convert_dates = self._convert_dates + # 1. Convert dates + if self._convert_dates is not None: + for i, col in enumerate(data): + if i in convert_dates: + data[col] = _datetime_to_stata_elapsed_vec( + data[col], self.fmtlist[i] + ) + # 2. Convert strls + data = self._convert_strls(data) + + # 3. Convert bad string data to '' and pad to correct length + dtypes = {} + native_byteorder = self._byteorder == _set_endianness(sys.byteorder) + for i, col in enumerate(data): + typ = typlist[i] + if typ <= self._max_string_length: + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Downcasting object dtype arrays", + category=FutureWarning, + ) + dc = data[col].fillna("") + data[col] = dc.apply(_pad_bytes, args=(typ,)) + stype = f"S{typ}" + dtypes[col] = stype + data[col] = data[col].astype(stype) + else: + dtype = data[col].dtype + if not native_byteorder: + dtype = dtype.newbyteorder(self._byteorder) + dtypes[col] = dtype + + return data.to_records(index=False, column_dtypes=dtypes) + + def _write_data(self, records: np.rec.recarray) -> None: + self._write_bytes(records.tobytes()) + + @staticmethod + def _null_terminate_str(s: str) -> str: + s += "\x00" + return s + + def _null_terminate_bytes(self, s: str) -> bytes: + return self._null_terminate_str(s).encode(self._encoding) + + +def _dtype_to_stata_type_117(dtype: np.dtype, column: Series, force_strl: bool) -> int: + """ + Converts dtype types to stata types. Returns the byte of the given ordinal. + See TYPE_MAP and comments for an explanation. This is also explained in + the dta spec. + 1 - 2045 are strings of this length + Pandas Stata + 32768 - for object strL + 65526 - for int8 byte + 65527 - for int16 int + 65528 - for int32 long + 65529 - for float32 float + 65530 - for double double + + If there are dates to convert, then dtype will already have the correct + type inserted. + """ + # TODO: expand to handle datetime to integer conversion + if force_strl: + return 32768 + if dtype.type is np.object_: # try to coerce it to the biggest string + # not memory efficient, what else could we + # do? + itemsize = max_len_string_array(ensure_object(column._values)) + itemsize = max(itemsize, 1) + if itemsize <= 2045: + return itemsize + return 32768 + elif dtype.type is np.float64: + return 65526 + elif dtype.type is np.float32: + return 65527 + elif dtype.type is np.int32: + return 65528 + elif dtype.type is np.int16: + return 65529 + elif dtype.type is np.int8: + return 65530 + else: # pragma : no cover + raise NotImplementedError(f"Data type {dtype} not supported.") + + +def _pad_bytes_new(name: str | bytes, length: int) -> bytes: + """ + Takes a bytes instance and pads it with null bytes until it's length chars. + """ + if isinstance(name, str): + name = bytes(name, "utf-8") + return name + b"\x00" * (length - len(name)) + + +class StataStrLWriter: + """ + Converter for Stata StrLs + + Stata StrLs map 8 byte values to strings which are stored using a + dictionary-like format where strings are keyed to two values. + + Parameters + ---------- + df : DataFrame + DataFrame to convert + columns : Sequence[str] + List of columns names to convert to StrL + version : int, optional + dta version. Currently supports 117, 118 and 119 + byteorder : str, optional + Can be ">", "<", "little", or "big". default is `sys.byteorder` + + Notes + ----- + Supports creation of the StrL block of a dta file for dta versions + 117, 118 and 119. These differ in how the GSO is stored. 118 and + 119 store the GSO lookup value as a uint32 and a uint64, while 117 + uses two uint32s. 118 and 119 also encode all strings as unicode + which is required by the format. 117 uses 'latin-1' a fixed width + encoding that extends the 7-bit ascii table with an additional 128 + characters. + """ + + def __init__( + self, + df: DataFrame, + columns: Sequence[str], + version: int = 117, + byteorder: str | None = None, + ) -> None: + if version not in (117, 118, 119): + raise ValueError("Only dta versions 117, 118 and 119 supported") + self._dta_ver = version + + self.df = df + self.columns = columns + self._gso_table = {"": (0, 0)} + if byteorder is None: + byteorder = sys.byteorder + self._byteorder = _set_endianness(byteorder) + + gso_v_type = "I" # uint32 + gso_o_type = "Q" # uint64 + self._encoding = "utf-8" + if version == 117: + o_size = 4 + gso_o_type = "I" # 117 used uint32 + self._encoding = "latin-1" + elif version == 118: + o_size = 6 + else: # version == 119 + o_size = 5 + self._o_offet = 2 ** (8 * (8 - o_size)) + self._gso_o_type = gso_o_type + self._gso_v_type = gso_v_type + + def _convert_key(self, key: tuple[int, int]) -> int: + v, o = key + return v + self._o_offet * o + + def generate_table(self) -> tuple[dict[str, tuple[int, int]], DataFrame]: + """ + Generates the GSO lookup table for the DataFrame + + Returns + ------- + gso_table : dict + Ordered dictionary using the string found as keys + and their lookup position (v,o) as values + gso_df : DataFrame + DataFrame where strl columns have been converted to + (v,o) values + + Notes + ----- + Modifies the DataFrame in-place. + + The DataFrame returned encodes the (v,o) values as uint64s. The + encoding depends on the dta version, and can be expressed as + + enc = v + o * 2 ** (o_size * 8) + + so that v is stored in the lower bits and o is in the upper + bits. o_size is + + * 117: 4 + * 118: 6 + * 119: 5 + """ + gso_table = self._gso_table + gso_df = self.df + columns = list(gso_df.columns) + selected = gso_df[self.columns] + col_index = [(col, columns.index(col)) for col in self.columns] + keys = np.empty(selected.shape, dtype=np.uint64) + for o, (idx, row) in enumerate(selected.iterrows()): + for j, (col, v) in enumerate(col_index): + val = row[col] + # Allow columns with mixed str and None (GH 23633) + val = "" if val is None else val + key = gso_table.get(val, None) + if key is None: + # Stata prefers human numbers + key = (v + 1, o + 1) + gso_table[val] = key + keys[o, j] = self._convert_key(key) + for i, col in enumerate(self.columns): + gso_df[col] = keys[:, i] + + return gso_table, gso_df + + def generate_blob(self, gso_table: dict[str, tuple[int, int]]) -> bytes: + """ + Generates the binary blob of GSOs that is written to the dta file. + + Parameters + ---------- + gso_table : dict + Ordered dictionary (str, vo) + + Returns + ------- + gso : bytes + Binary content of dta file to be placed between strl tags + + Notes + ----- + Output format depends on dta version. 117 uses two uint32s to + express v and o while 118+ uses a uint32 for v and a uint64 for o. + """ + # Format information + # Length includes null term + # 117 + # GSOvvvvooootllllxxxxxxxxxxxxxxx...x + # 3 u4 u4 u1 u4 string + null term + # + # 118, 119 + # GSOvvvvooooooootllllxxxxxxxxxxxxxxx...x + # 3 u4 u8 u1 u4 string + null term + + bio = BytesIO() + gso = bytes("GSO", "ascii") + gso_type = struct.pack(self._byteorder + "B", 130) + null = struct.pack(self._byteorder + "B", 0) + v_type = self._byteorder + self._gso_v_type + o_type = self._byteorder + self._gso_o_type + len_type = self._byteorder + "I" + for strl, vo in gso_table.items(): + if vo == (0, 0): + continue + v, o = vo + + # GSO + bio.write(gso) + + # vvvv + bio.write(struct.pack(v_type, v)) + + # oooo / oooooooo + bio.write(struct.pack(o_type, o)) + + # t + bio.write(gso_type) + + # llll + utf8_string = bytes(strl, "utf-8") + bio.write(struct.pack(len_type, len(utf8_string) + 1)) + + # xxx...xxx + bio.write(utf8_string) + bio.write(null) + + return bio.getvalue() + + +class StataWriter117(StataWriter): + """ + A class for writing Stata binary dta files in Stata 13 format (117) + + Parameters + ---------- + fname : path (string), buffer or path object + string, path object (pathlib.Path or py._path.local.LocalPath) or + object implementing a binary write() functions. If using a buffer + then the buffer will not be automatically closed after the file + is written. + data : DataFrame + Input to save + convert_dates : dict + Dictionary mapping columns containing datetime types to stata internal + format to use when writing the dates. Options are 'tc', 'td', 'tm', + 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. + Datetime columns that do not have a conversion type specified will be + converted to 'tc'. Raises NotImplementedError if a datetime column has + timezone information + write_index : bool + Write the index to Stata dataset. + byteorder : str + Can be ">", "<", "little", or "big". default is `sys.byteorder` + time_stamp : datetime + A datetime to use as file creation date. Default is the current time + data_label : str + A label for the data set. Must be 80 characters or smaller. + variable_labels : dict + Dictionary containing columns as keys and variable labels as values. + Each label must be 80 characters or smaller. + convert_strl : list + List of columns names to convert to Stata StrL format. Columns with + more than 2045 characters are automatically written as StrL. + Smaller columns can be converted by including the column name. Using + StrLs can reduce output file size when strings are longer than 8 + characters, and either frequently repeated or sparse. + {compression_options} + + .. versionchanged:: 1.4.0 Zstandard support. + + value_labels : dict of dicts + Dictionary containing columns as keys and dictionaries of column value + to labels as values. The combined length of all labels for a single + variable must be 32,000 characters or smaller. + + .. versionadded:: 1.4.0 + + Returns + ------- + writer : StataWriter117 instance + The StataWriter117 instance has a write_file method, which will + write the file to the given `fname`. + + Raises + ------ + NotImplementedError + * If datetimes contain timezone information + ValueError + * Columns listed in convert_dates are neither datetime64[ns] + or datetime + * Column dtype is not representable in Stata + * Column listed in convert_dates is not in DataFrame + * Categorical label contains more than 32,000 characters + + Examples + -------- + >>> data = pd.DataFrame([[1.0, 1, 'a']], columns=['a', 'b', 'c']) + >>> writer = pd.io.stata.StataWriter117('./data_file.dta', data) + >>> writer.write_file() + + Directly write a zip file + >>> compression = {"method": "zip", "archive_name": "data_file.dta"} + >>> writer = pd.io.stata.StataWriter117( + ... './data_file.zip', data, compression=compression + ... ) + >>> writer.write_file() + + Or with long strings stored in strl format + >>> data = pd.DataFrame([['A relatively long string'], [''], ['']], + ... columns=['strls']) + >>> writer = pd.io.stata.StataWriter117( + ... './data_file_with_long_strings.dta', data, convert_strl=['strls']) + >>> writer.write_file() + """ + + _max_string_length = 2045 + _dta_version = 117 + + def __init__( + self, + fname: FilePath | WriteBuffer[bytes], + data: DataFrame, + convert_dates: dict[Hashable, str] | None = None, + write_index: bool = True, + byteorder: str | None = None, + time_stamp: datetime | None = None, + data_label: str | None = None, + variable_labels: dict[Hashable, str] | None = None, + convert_strl: Sequence[Hashable] | None = None, + compression: CompressionOptions = "infer", + storage_options: StorageOptions | None = None, + *, + value_labels: dict[Hashable, dict[float, str]] | None = None, + ) -> None: + # Copy to new list since convert_strl might be modified later + self._convert_strl: list[Hashable] = [] + if convert_strl is not None: + self._convert_strl.extend(convert_strl) + + super().__init__( + fname, + data, + convert_dates, + write_index, + byteorder=byteorder, + time_stamp=time_stamp, + data_label=data_label, + variable_labels=variable_labels, + value_labels=value_labels, + compression=compression, + storage_options=storage_options, + ) + self._map: dict[str, int] = {} + self._strl_blob = b"" + + @staticmethod + def _tag(val: str | bytes, tag: str) -> bytes: + """Surround val with """ + if isinstance(val, str): + val = bytes(val, "utf-8") + return bytes("<" + tag + ">", "utf-8") + val + bytes("", "utf-8") + + def _update_map(self, tag: str) -> None: + """Update map location for tag with file position""" + assert self.handles.handle is not None + self._map[tag] = self.handles.handle.tell() + + def _write_header( + self, + data_label: str | None = None, + time_stamp: datetime | None = None, + ) -> None: + """Write the file header""" + byteorder = self._byteorder + self._write_bytes(bytes("", "utf-8")) + bio = BytesIO() + # ds_format - 117 + bio.write(self._tag(bytes(str(self._dta_version), "utf-8"), "release")) + # byteorder + bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", "byteorder")) + # number of vars, 2 bytes in 117 and 118, 4 byte in 119 + nvar_type = "H" if self._dta_version <= 118 else "I" + bio.write(self._tag(struct.pack(byteorder + nvar_type, self.nvar), "K")) + # 117 uses 4 bytes, 118 uses 8 + nobs_size = "I" if self._dta_version == 117 else "Q" + bio.write(self._tag(struct.pack(byteorder + nobs_size, self.nobs), "N")) + # data label 81 bytes, char, null terminated + label = data_label[:80] if data_label is not None else "" + encoded_label = label.encode(self._encoding) + label_size = "B" if self._dta_version == 117 else "H" + label_len = struct.pack(byteorder + label_size, len(encoded_label)) + encoded_label = label_len + encoded_label + bio.write(self._tag(encoded_label, "label")) + # time stamp, 18 bytes, char, null terminated + # format dd Mon yyyy hh:mm + if time_stamp is None: + time_stamp = datetime.now() + elif not isinstance(time_stamp, datetime): + raise ValueError("time_stamp should be datetime type") + # Avoid locale-specific month conversion + months = [ + "Jan", + "Feb", + "Mar", + "Apr", + "May", + "Jun", + "Jul", + "Aug", + "Sep", + "Oct", + "Nov", + "Dec", + ] + month_lookup = {i + 1: month for i, month in enumerate(months)} + ts = ( + time_stamp.strftime("%d ") + + month_lookup[time_stamp.month] + + time_stamp.strftime(" %Y %H:%M") + ) + # '\x11' added due to inspection of Stata file + stata_ts = b"\x11" + bytes(ts, "utf-8") + bio.write(self._tag(stata_ts, "timestamp")) + self._write_bytes(self._tag(bio.getvalue(), "header")) + + def _write_map(self) -> None: + """ + Called twice during file write. The first populates the values in + the map with 0s. The second call writes the final map locations when + all blocks have been written. + """ + if not self._map: + self._map = { + "stata_data": 0, + "map": self.handles.handle.tell(), + "variable_types": 0, + "varnames": 0, + "sortlist": 0, + "formats": 0, + "value_label_names": 0, + "variable_labels": 0, + "characteristics": 0, + "data": 0, + "strls": 0, + "value_labels": 0, + "stata_data_close": 0, + "end-of-file": 0, + } + # Move to start of map + self.handles.handle.seek(self._map["map"]) + bio = BytesIO() + for val in self._map.values(): + bio.write(struct.pack(self._byteorder + "Q", val)) + self._write_bytes(self._tag(bio.getvalue(), "map")) + + def _write_variable_types(self) -> None: + self._update_map("variable_types") + bio = BytesIO() + for typ in self.typlist: + bio.write(struct.pack(self._byteorder + "H", typ)) + self._write_bytes(self._tag(bio.getvalue(), "variable_types")) + + def _write_varnames(self) -> None: + self._update_map("varnames") + bio = BytesIO() + # 118 scales by 4 to accommodate utf-8 data worst case encoding + vn_len = 32 if self._dta_version == 117 else 128 + for name in self.varlist: + name = self._null_terminate_str(name) + name = _pad_bytes_new(name[:32].encode(self._encoding), vn_len + 1) + bio.write(name) + self._write_bytes(self._tag(bio.getvalue(), "varnames")) + + def _write_sortlist(self) -> None: + self._update_map("sortlist") + sort_size = 2 if self._dta_version < 119 else 4 + self._write_bytes(self._tag(b"\x00" * sort_size * (self.nvar + 1), "sortlist")) + + def _write_formats(self) -> None: + self._update_map("formats") + bio = BytesIO() + fmt_len = 49 if self._dta_version == 117 else 57 + for fmt in self.fmtlist: + bio.write(_pad_bytes_new(fmt.encode(self._encoding), fmt_len)) + self._write_bytes(self._tag(bio.getvalue(), "formats")) + + def _write_value_label_names(self) -> None: + self._update_map("value_label_names") + bio = BytesIO() + # 118 scales by 4 to accommodate utf-8 data worst case encoding + vl_len = 32 if self._dta_version == 117 else 128 + for i in range(self.nvar): + # Use variable name when categorical + name = "" # default name + if self._has_value_labels[i]: + name = self.varlist[i] + name = self._null_terminate_str(name) + encoded_name = _pad_bytes_new(name[:32].encode(self._encoding), vl_len + 1) + bio.write(encoded_name) + self._write_bytes(self._tag(bio.getvalue(), "value_label_names")) + + def _write_variable_labels(self) -> None: + # Missing labels are 80 blank characters plus null termination + self._update_map("variable_labels") + bio = BytesIO() + # 118 scales by 4 to accommodate utf-8 data worst case encoding + vl_len = 80 if self._dta_version == 117 else 320 + blank = _pad_bytes_new("", vl_len + 1) + + if self._variable_labels is None: + for _ in range(self.nvar): + bio.write(blank) + self._write_bytes(self._tag(bio.getvalue(), "variable_labels")) + return + + for col in self.data: + if col in self._variable_labels: + label = self._variable_labels[col] + if len(label) > 80: + raise ValueError("Variable labels must be 80 characters or fewer") + try: + encoded = label.encode(self._encoding) + except UnicodeEncodeError as err: + raise ValueError( + "Variable labels must contain only characters that " + f"can be encoded in {self._encoding}" + ) from err + + bio.write(_pad_bytes_new(encoded, vl_len + 1)) + else: + bio.write(blank) + self._write_bytes(self._tag(bio.getvalue(), "variable_labels")) + + def _write_characteristics(self) -> None: + self._update_map("characteristics") + self._write_bytes(self._tag(b"", "characteristics")) + + def _write_data(self, records) -> None: + self._update_map("data") + self._write_bytes(b"") + self._write_bytes(records.tobytes()) + self._write_bytes(b"") + + def _write_strls(self) -> None: + self._update_map("strls") + self._write_bytes(self._tag(self._strl_blob, "strls")) + + def _write_expansion_fields(self) -> None: + """No-op in dta 117+""" + + def _write_value_labels(self) -> None: + self._update_map("value_labels") + bio = BytesIO() + for vl in self._value_labels: + lab = vl.generate_value_label(self._byteorder) + lab = self._tag(lab, "lbl") + bio.write(lab) + self._write_bytes(self._tag(bio.getvalue(), "value_labels")) + + def _write_file_close_tag(self) -> None: + self._update_map("stata_data_close") + self._write_bytes(bytes("", "utf-8")) + self._update_map("end-of-file") + + def _update_strl_names(self) -> None: + """ + Update column names for conversion to strl if they might have been + changed to comply with Stata naming rules + """ + # Update convert_strl if names changed + for orig, new in self._converted_names.items(): + if orig in self._convert_strl: + idx = self._convert_strl.index(orig) + self._convert_strl[idx] = new + + def _convert_strls(self, data: DataFrame) -> DataFrame: + """ + Convert columns to StrLs if either very large or in the + convert_strl variable + """ + convert_cols = [ + col + for i, col in enumerate(data) + if self.typlist[i] == 32768 or col in self._convert_strl + ] + + if convert_cols: + ssw = StataStrLWriter(data, convert_cols, version=self._dta_version) + tab, new_data = ssw.generate_table() + data = new_data + self._strl_blob = ssw.generate_blob(tab) + return data + + def _set_formats_and_types(self, dtypes: Series) -> None: + self.typlist = [] + self.fmtlist = [] + for col, dtype in dtypes.items(): + force_strl = col in self._convert_strl + fmt = _dtype_to_default_stata_fmt( + dtype, + self.data[col], + dta_version=self._dta_version, + force_strl=force_strl, + ) + self.fmtlist.append(fmt) + self.typlist.append( + _dtype_to_stata_type_117(dtype, self.data[col], force_strl) + ) + + +class StataWriterUTF8(StataWriter117): + """ + Stata binary dta file writing in Stata 15 (118) and 16 (119) formats + + DTA 118 and 119 format files support unicode string data (both fixed + and strL) format. Unicode is also supported in value labels, variable + labels and the dataset label. Format 119 is automatically used if the + file contains more than 32,767 variables. + + Parameters + ---------- + fname : path (string), buffer or path object + string, path object (pathlib.Path or py._path.local.LocalPath) or + object implementing a binary write() functions. If using a buffer + then the buffer will not be automatically closed after the file + is written. + data : DataFrame + Input to save + convert_dates : dict, default None + Dictionary mapping columns containing datetime types to stata internal + format to use when writing the dates. Options are 'tc', 'td', 'tm', + 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. + Datetime columns that do not have a conversion type specified will be + converted to 'tc'. Raises NotImplementedError if a datetime column has + timezone information + write_index : bool, default True + Write the index to Stata dataset. + byteorder : str, default None + Can be ">", "<", "little", or "big". default is `sys.byteorder` + time_stamp : datetime, default None + A datetime to use as file creation date. Default is the current time + data_label : str, default None + A label for the data set. Must be 80 characters or smaller. + variable_labels : dict, default None + Dictionary containing columns as keys and variable labels as values. + Each label must be 80 characters or smaller. + convert_strl : list, default None + List of columns names to convert to Stata StrL format. Columns with + more than 2045 characters are automatically written as StrL. + Smaller columns can be converted by including the column name. Using + StrLs can reduce output file size when strings are longer than 8 + characters, and either frequently repeated or sparse. + version : int, default None + The dta version to use. By default, uses the size of data to determine + the version. 118 is used if data.shape[1] <= 32767, and 119 is used + for storing larger DataFrames. + {compression_options} + + .. versionchanged:: 1.4.0 Zstandard support. + + value_labels : dict of dicts + Dictionary containing columns as keys and dictionaries of column value + to labels as values. The combined length of all labels for a single + variable must be 32,000 characters or smaller. + + .. versionadded:: 1.4.0 + + Returns + ------- + StataWriterUTF8 + The instance has a write_file method, which will write the file to the + given `fname`. + + Raises + ------ + NotImplementedError + * If datetimes contain timezone information + ValueError + * Columns listed in convert_dates are neither datetime64[ns] + or datetime + * Column dtype is not representable in Stata + * Column listed in convert_dates is not in DataFrame + * Categorical label contains more than 32,000 characters + + Examples + -------- + Using Unicode data and column names + + >>> from pandas.io.stata import StataWriterUTF8 + >>> data = pd.DataFrame([[1.0, 1, 'ᴬ']], columns=['a', 'β', 'ĉ']) + >>> writer = StataWriterUTF8('./data_file.dta', data) + >>> writer.write_file() + + Directly write a zip file + >>> compression = {"method": "zip", "archive_name": "data_file.dta"} + >>> writer = StataWriterUTF8('./data_file.zip', data, compression=compression) + >>> writer.write_file() + + Or with long strings stored in strl format + + >>> data = pd.DataFrame([['ᴀ relatively long ŝtring'], [''], ['']], + ... columns=['strls']) + >>> writer = StataWriterUTF8('./data_file_with_long_strings.dta', data, + ... convert_strl=['strls']) + >>> writer.write_file() + """ + + _encoding: Literal["utf-8"] = "utf-8" + + def __init__( + self, + fname: FilePath | WriteBuffer[bytes], + data: DataFrame, + convert_dates: dict[Hashable, str] | None = None, + write_index: bool = True, + byteorder: str | None = None, + time_stamp: datetime | None = None, + data_label: str | None = None, + variable_labels: dict[Hashable, str] | None = None, + convert_strl: Sequence[Hashable] | None = None, + version: int | None = None, + compression: CompressionOptions = "infer", + storage_options: StorageOptions | None = None, + *, + value_labels: dict[Hashable, dict[float, str]] | None = None, + ) -> None: + if version is None: + version = 118 if data.shape[1] <= 32767 else 119 + elif version not in (118, 119): + raise ValueError("version must be either 118 or 119.") + elif version == 118 and data.shape[1] > 32767: + raise ValueError( + "You must use version 119 for data sets containing more than" + "32,767 variables" + ) + + super().__init__( + fname, + data, + convert_dates=convert_dates, + write_index=write_index, + byteorder=byteorder, + time_stamp=time_stamp, + data_label=data_label, + variable_labels=variable_labels, + value_labels=value_labels, + convert_strl=convert_strl, + compression=compression, + storage_options=storage_options, + ) + # Override version set in StataWriter117 init + self._dta_version = version + + def _validate_variable_name(self, name: str) -> str: + """ + Validate variable names for Stata export. + + Parameters + ---------- + name : str + Variable name + + Returns + ------- + str + The validated name with invalid characters replaced with + underscores. + + Notes + ----- + Stata 118+ support most unicode characters. The only limitation is in + the ascii range where the characters supported are a-z, A-Z, 0-9 and _. + """ + # High code points appear to be acceptable + for c in name: + if ( + ( + ord(c) < 128 + and (c < "A" or c > "Z") + and (c < "a" or c > "z") + and (c < "0" or c > "9") + and c != "_" + ) + or 128 <= ord(c) < 192 + or c in {"×", "÷"} # noqa: RUF001 + ): + name = name.replace(c, "_") + + return name diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/xml.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/xml.py new file mode 100644 index 0000000000000000000000000000000000000000..ac497cd266027f7af71884996182e1725baba361 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/xml.py @@ -0,0 +1,1177 @@ +""" +:mod:``pandas.io.xml`` is a module for reading XML. +""" + +from __future__ import annotations + +import io +from os import PathLike +from typing import ( + TYPE_CHECKING, + Any, + Callable, +) +import warnings + +from pandas._libs import lib +from pandas.compat._optional import import_optional_dependency +from pandas.errors import ( + AbstractMethodError, + ParserError, +) +from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level +from pandas.util._validators import check_dtype_backend + +from pandas.core.dtypes.common import is_list_like + +from pandas.core.shared_docs import _shared_docs + +from pandas.io.common import ( + file_exists, + get_handle, + infer_compression, + is_file_like, + is_fsspec_url, + is_url, + stringify_path, +) +from pandas.io.parsers import TextParser + +if TYPE_CHECKING: + from collections.abc import Sequence + from xml.etree.ElementTree import Element + + from lxml import etree + + from pandas._typing import ( + CompressionOptions, + ConvertersArg, + DtypeArg, + DtypeBackend, + FilePath, + ParseDatesArg, + ReadBuffer, + StorageOptions, + XMLParsers, + ) + + from pandas import DataFrame + + +@doc( + storage_options=_shared_docs["storage_options"], + decompression_options=_shared_docs["decompression_options"] % "path_or_buffer", +) +class _XMLFrameParser: + """ + Internal subclass to parse XML into DataFrames. + + Parameters + ---------- + path_or_buffer : a valid JSON ``str``, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. + + xpath : str or regex + The ``XPath`` expression to parse required set of nodes for + migration to :class:`~pandas.DataFrame`. ``etree`` supports limited ``XPath``. + + namespaces : dict + The namespaces defined in XML document (``xmlns:namespace='URI'``) + as dicts with key being namespace and value the URI. + + elems_only : bool + Parse only the child elements at the specified ``xpath``. + + attrs_only : bool + Parse only the attributes at the specified ``xpath``. + + names : list + Column names for :class:`~pandas.DataFrame` of parsed XML data. + + dtype : dict + Data type for data or columns. E.g. {{'a': np.float64, + 'b': np.int32, 'c': 'Int64'}} + + .. versionadded:: 1.5.0 + + converters : dict, optional + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels. + + .. versionadded:: 1.5.0 + + parse_dates : bool or list of int or names or list of lists or dict + Converts either index or select columns to datetimes + + .. versionadded:: 1.5.0 + + encoding : str + Encoding of xml object or document. + + stylesheet : str or file-like + URL, file, file-like object, or a raw string containing XSLT, + ``etree`` does not support XSLT but retained for consistency. + + iterparse : dict, optional + Dict with row element as key and list of descendant elements + and/or attributes as value to be retrieved in iterparsing of + XML document. + + .. versionadded:: 1.5.0 + + {decompression_options} + + .. versionchanged:: 1.4.0 Zstandard support. + + {storage_options} + + See also + -------- + pandas.io.xml._EtreeFrameParser + pandas.io.xml._LxmlFrameParser + + Notes + ----- + To subclass this class effectively you must override the following methods:` + * :func:`parse_data` + * :func:`_parse_nodes` + * :func:`_iterparse_nodes` + * :func:`_parse_doc` + * :func:`_validate_names` + * :func:`_validate_path` + + + See each method's respective documentation for details on their + functionality. + """ + + def __init__( + self, + path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], + xpath: str, + namespaces: dict[str, str] | None, + elems_only: bool, + attrs_only: bool, + names: Sequence[str] | None, + dtype: DtypeArg | None, + converters: ConvertersArg | None, + parse_dates: ParseDatesArg | None, + encoding: str | None, + stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, + iterparse: dict[str, list[str]] | None, + compression: CompressionOptions, + storage_options: StorageOptions, + ) -> None: + self.path_or_buffer = path_or_buffer + self.xpath = xpath + self.namespaces = namespaces + self.elems_only = elems_only + self.attrs_only = attrs_only + self.names = names + self.dtype = dtype + self.converters = converters + self.parse_dates = parse_dates + self.encoding = encoding + self.stylesheet = stylesheet + self.iterparse = iterparse + self.is_style = None + self.compression: CompressionOptions = compression + self.storage_options = storage_options + + def parse_data(self) -> list[dict[str, str | None]]: + """ + Parse xml data. + + This method will call the other internal methods to + validate ``xpath``, names, parse and return specific nodes. + """ + + raise AbstractMethodError(self) + + def _parse_nodes(self, elems: list[Any]) -> list[dict[str, str | None]]: + """ + Parse xml nodes. + + This method will parse the children and attributes of elements + in ``xpath``, conditionally for only elements, only attributes + or both while optionally renaming node names. + + Raises + ------ + ValueError + * If only elements and only attributes are specified. + + Notes + ----- + Namespace URIs will be removed from return node values. Also, + elements with missing children or attributes compared to siblings + will have optional keys filled with None values. + """ + + dicts: list[dict[str, str | None]] + + if self.elems_only and self.attrs_only: + raise ValueError("Either element or attributes can be parsed not both.") + if self.elems_only: + if self.names: + dicts = [ + { + **( + {el.tag: el.text} + if el.text and not el.text.isspace() + else {} + ), + **{ + nm: ch.text if ch.text else None + for nm, ch in zip(self.names, el.findall("*")) + }, + } + for el in elems + ] + else: + dicts = [ + {ch.tag: ch.text if ch.text else None for ch in el.findall("*")} + for el in elems + ] + + elif self.attrs_only: + dicts = [ + {k: v if v else None for k, v in el.attrib.items()} for el in elems + ] + + elif self.names: + dicts = [ + { + **el.attrib, + **({el.tag: el.text} if el.text and not el.text.isspace() else {}), + **{ + nm: ch.text if ch.text else None + for nm, ch in zip(self.names, el.findall("*")) + }, + } + for el in elems + ] + + else: + dicts = [ + { + **el.attrib, + **({el.tag: el.text} if el.text and not el.text.isspace() else {}), + **{ch.tag: ch.text if ch.text else None for ch in el.findall("*")}, + } + for el in elems + ] + + dicts = [ + {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} for d in dicts + ] + + keys = list(dict.fromkeys([k for d in dicts for k in d.keys()])) + dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts] + + if self.names: + dicts = [dict(zip(self.names, d.values())) for d in dicts] + + return dicts + + def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]: + """ + Iterparse xml nodes. + + This method will read in local disk, decompressed XML files for elements + and underlying descendants using iterparse, a method to iterate through + an XML tree without holding entire XML tree in memory. + + Raises + ------ + TypeError + * If ``iterparse`` is not a dict or its dict value is not list-like. + ParserError + * If ``path_or_buffer`` is not a physical file on disk or file-like object. + * If no data is returned from selected items in ``iterparse``. + + Notes + ----- + Namespace URIs will be removed from return node values. Also, + elements with missing children or attributes in submitted list + will have optional keys filled with None values. + """ + + dicts: list[dict[str, str | None]] = [] + row: dict[str, str | None] | None = None + + if not isinstance(self.iterparse, dict): + raise TypeError( + f"{type(self.iterparse).__name__} is not a valid type for iterparse" + ) + + row_node = next(iter(self.iterparse.keys())) if self.iterparse else "" + if not is_list_like(self.iterparse[row_node]): + raise TypeError( + f"{type(self.iterparse[row_node])} is not a valid type " + "for value in iterparse" + ) + + if (not hasattr(self.path_or_buffer, "read")) and ( + not isinstance(self.path_or_buffer, (str, PathLike)) + or is_url(self.path_or_buffer) + or is_fsspec_url(self.path_or_buffer) + or ( + isinstance(self.path_or_buffer, str) + and self.path_or_buffer.startswith((" list[Any]: + """ + Validate ``xpath``. + + This method checks for syntax, evaluation, or empty nodes return. + + Raises + ------ + SyntaxError + * If xpah is not supported or issues with namespaces. + + ValueError + * If xpah does not return any nodes. + """ + + raise AbstractMethodError(self) + + def _validate_names(self) -> None: + """ + Validate names. + + This method will check if names is a list-like and aligns + with length of parse nodes. + + Raises + ------ + ValueError + * If value is not a list and less then length of nodes. + """ + raise AbstractMethodError(self) + + def _parse_doc( + self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str] + ) -> Element | etree._Element: + """ + Build tree from path_or_buffer. + + This method will parse XML object into tree + either from string/bytes or file location. + """ + raise AbstractMethodError(self) + + +class _EtreeFrameParser(_XMLFrameParser): + """ + Internal class to parse XML into DataFrames with the Python + standard library XML module: `xml.etree.ElementTree`. + """ + + def parse_data(self) -> list[dict[str, str | None]]: + from xml.etree.ElementTree import iterparse + + if self.stylesheet is not None: + raise ValueError( + "To use stylesheet, you need lxml installed and selected as parser." + ) + + if self.iterparse is None: + self.xml_doc = self._parse_doc(self.path_or_buffer) + elems = self._validate_path() + + self._validate_names() + + xml_dicts: list[dict[str, str | None]] = ( + self._parse_nodes(elems) + if self.iterparse is None + else self._iterparse_nodes(iterparse) + ) + + return xml_dicts + + def _validate_path(self) -> list[Any]: + """ + Notes + ----- + ``etree`` supports limited ``XPath``. If user attempts a more complex + expression syntax error will raise. + """ + + msg = ( + "xpath does not return any nodes or attributes. " + "Be sure to specify in `xpath` the parent nodes of " + "children and attributes to parse. " + "If document uses namespaces denoted with " + "xmlns, be sure to define namespaces and " + "use them in xpath." + ) + try: + elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces) + children = [ch for el in elems for ch in el.findall("*")] + attrs = {k: v for el in elems for k, v in el.attrib.items()} + + if elems is None: + raise ValueError(msg) + + if elems is not None: + if self.elems_only and children == []: + raise ValueError(msg) + if self.attrs_only and attrs == {}: + raise ValueError(msg) + if children == [] and attrs == {}: + raise ValueError(msg) + + except (KeyError, SyntaxError): + raise SyntaxError( + "You have used an incorrect or unsupported XPath " + "expression for etree library or you used an " + "undeclared namespace prefix." + ) + + return elems + + def _validate_names(self) -> None: + children: list[Any] + + if self.names: + if self.iterparse: + children = self.iterparse[next(iter(self.iterparse))] + else: + parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces) + children = parent.findall("*") if parent is not None else [] + + if is_list_like(self.names): + if len(self.names) < len(children): + raise ValueError( + "names does not match length of child elements in xpath." + ) + else: + raise TypeError( + f"{type(self.names).__name__} is not a valid type for names" + ) + + def _parse_doc( + self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str] + ) -> Element: + from xml.etree.ElementTree import ( + XMLParser, + parse, + ) + + handle_data = get_data_from_filepath( + filepath_or_buffer=raw_doc, + encoding=self.encoding, + compression=self.compression, + storage_options=self.storage_options, + ) + + with preprocess_data(handle_data) as xml_data: + curr_parser = XMLParser(encoding=self.encoding) + document = parse(xml_data, parser=curr_parser) + + return document.getroot() + + +class _LxmlFrameParser(_XMLFrameParser): + """ + Internal class to parse XML into :class:`~pandas.DataFrame` with third-party + full-featured XML library, ``lxml``, that supports + ``XPath`` 1.0 and XSLT 1.0. + """ + + def parse_data(self) -> list[dict[str, str | None]]: + """ + Parse xml data. + + This method will call the other internal methods to + validate ``xpath``, names, optionally parse and run XSLT, + and parse original or transformed XML and return specific nodes. + """ + from lxml.etree import iterparse + + if self.iterparse is None: + self.xml_doc = self._parse_doc(self.path_or_buffer) + + if self.stylesheet: + self.xsl_doc = self._parse_doc(self.stylesheet) + self.xml_doc = self._transform_doc() + + elems = self._validate_path() + + self._validate_names() + + xml_dicts: list[dict[str, str | None]] = ( + self._parse_nodes(elems) + if self.iterparse is None + else self._iterparse_nodes(iterparse) + ) + + return xml_dicts + + def _validate_path(self) -> list[Any]: + msg = ( + "xpath does not return any nodes or attributes. " + "Be sure to specify in `xpath` the parent nodes of " + "children and attributes to parse. " + "If document uses namespaces denoted with " + "xmlns, be sure to define namespaces and " + "use them in xpath." + ) + + elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces) + children = [ch for el in elems for ch in el.xpath("*")] + attrs = {k: v for el in elems for k, v in el.attrib.items()} + + if elems == []: + raise ValueError(msg) + + if elems != []: + if self.elems_only and children == []: + raise ValueError(msg) + if self.attrs_only and attrs == {}: + raise ValueError(msg) + if children == [] and attrs == {}: + raise ValueError(msg) + + return elems + + def _validate_names(self) -> None: + children: list[Any] + + if self.names: + if self.iterparse: + children = self.iterparse[next(iter(self.iterparse))] + else: + children = self.xml_doc.xpath( + self.xpath + "[1]/*", namespaces=self.namespaces + ) + + if is_list_like(self.names): + if len(self.names) < len(children): + raise ValueError( + "names does not match length of child elements in xpath." + ) + else: + raise TypeError( + f"{type(self.names).__name__} is not a valid type for names" + ) + + def _parse_doc( + self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str] + ) -> etree._Element: + from lxml.etree import ( + XMLParser, + fromstring, + parse, + ) + + handle_data = get_data_from_filepath( + filepath_or_buffer=raw_doc, + encoding=self.encoding, + compression=self.compression, + storage_options=self.storage_options, + ) + + with preprocess_data(handle_data) as xml_data: + curr_parser = XMLParser(encoding=self.encoding) + + if isinstance(xml_data, io.StringIO): + if self.encoding is None: + raise TypeError( + "Can not pass encoding None when input is StringIO." + ) + + document = fromstring( + xml_data.getvalue().encode(self.encoding), parser=curr_parser + ) + else: + document = parse(xml_data, parser=curr_parser) + + return document + + def _transform_doc(self) -> etree._XSLTResultTree: + """ + Transform original tree using stylesheet. + + This method will transform original xml using XSLT script into + am ideally flatter xml document for easier parsing and migration + to Data Frame. + """ + from lxml.etree import XSLT + + transformer = XSLT(self.xsl_doc) + new_doc = transformer(self.xml_doc) + + return new_doc + + +def get_data_from_filepath( + filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str], + encoding: str | None, + compression: CompressionOptions, + storage_options: StorageOptions, +) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]: + """ + Extract raw XML data. + + The method accepts three input types: + 1. filepath (string-like) + 2. file-like object (e.g. open file object, StringIO) + 3. XML string or bytes + + This method turns (1) into (2) to simplify the rest of the processing. + It returns input types (2) and (3) unchanged. + """ + if not isinstance(filepath_or_buffer, bytes): + filepath_or_buffer = stringify_path(filepath_or_buffer) + + if ( + isinstance(filepath_or_buffer, str) + and not filepath_or_buffer.startswith((" io.StringIO | io.BytesIO: + """ + Convert extracted raw data. + + This method will return underlying data of extracted XML content. + The data either has a `read` attribute (e.g. a file object or a + StringIO/BytesIO) or is a string or bytes that is an XML document. + """ + + if isinstance(data, str): + data = io.StringIO(data) + + elif isinstance(data, bytes): + data = io.BytesIO(data) + + return data + + +def _data_to_frame(data, **kwargs) -> DataFrame: + """ + Convert parsed data to Data Frame. + + This method will bind xml dictionary data of keys and values + into named columns of Data Frame using the built-in TextParser + class that build Data Frame and infers specific dtypes. + """ + + tags = next(iter(data)) + nodes = [list(d.values()) for d in data] + + try: + with TextParser(nodes, names=tags, **kwargs) as tp: + return tp.read() + except ParserError: + raise ParserError( + "XML document may be too complex for import. " + "Try to flatten document and use distinct " + "element and attribute names." + ) + + +def _parse( + path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], + xpath: str, + namespaces: dict[str, str] | None, + elems_only: bool, + attrs_only: bool, + names: Sequence[str] | None, + dtype: DtypeArg | None, + converters: ConvertersArg | None, + parse_dates: ParseDatesArg | None, + encoding: str | None, + parser: XMLParsers, + stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, + iterparse: dict[str, list[str]] | None, + compression: CompressionOptions, + storage_options: StorageOptions, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + **kwargs, +) -> DataFrame: + """ + Call internal parsers. + + This method will conditionally call internal parsers: + LxmlFrameParser and/or EtreeParser. + + Raises + ------ + ImportError + * If lxml is not installed if selected as parser. + + ValueError + * If parser is not lxml or etree. + """ + + p: _EtreeFrameParser | _LxmlFrameParser + + if isinstance(path_or_buffer, str) and not any( + [ + is_file_like(path_or_buffer), + file_exists(path_or_buffer), + is_url(path_or_buffer), + is_fsspec_url(path_or_buffer), + ] + ): + warnings.warn( + "Passing literal xml to 'read_xml' is deprecated and " + "will be removed in a future version. To read from a " + "literal string, wrap it in a 'StringIO' object.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if parser == "lxml": + lxml = import_optional_dependency("lxml.etree", errors="ignore") + + if lxml is not None: + p = _LxmlFrameParser( + path_or_buffer, + xpath, + namespaces, + elems_only, + attrs_only, + names, + dtype, + converters, + parse_dates, + encoding, + stylesheet, + iterparse, + compression, + storage_options, + ) + else: + raise ImportError("lxml not found, please install or use the etree parser.") + + elif parser == "etree": + p = _EtreeFrameParser( + path_or_buffer, + xpath, + namespaces, + elems_only, + attrs_only, + names, + dtype, + converters, + parse_dates, + encoding, + stylesheet, + iterparse, + compression, + storage_options, + ) + else: + raise ValueError("Values for parser can only be lxml or etree.") + + data_dicts = p.parse_data() + + return _data_to_frame( + data=data_dicts, + dtype=dtype, + converters=converters, + parse_dates=parse_dates, + dtype_backend=dtype_backend, + **kwargs, + ) + + +@doc( + storage_options=_shared_docs["storage_options"], + decompression_options=_shared_docs["decompression_options"] % "path_or_buffer", +) +def read_xml( + path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], + *, + xpath: str = "./*", + namespaces: dict[str, str] | None = None, + elems_only: bool = False, + attrs_only: bool = False, + names: Sequence[str] | None = None, + dtype: DtypeArg | None = None, + converters: ConvertersArg | None = None, + parse_dates: ParseDatesArg | None = None, + # encoding can not be None for lxml and StringIO input + encoding: str | None = "utf-8", + parser: XMLParsers = "lxml", + stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None, + iterparse: dict[str, list[str]] | None = None, + compression: CompressionOptions = "infer", + storage_options: StorageOptions | None = None, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, +) -> DataFrame: + r""" + Read XML document into a :class:`~pandas.DataFrame` object. + + .. versionadded:: 1.3.0 + + Parameters + ---------- + path_or_buffer : str, path object, or file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a ``read()`` function. The string can be any valid XML + string or a path. The string can further be a URL. Valid URL schemes + include http, ftp, s3, and file. + + .. deprecated:: 2.1.0 + Passing xml literal strings is deprecated. + Wrap literal xml input in ``io.StringIO`` or ``io.BytesIO`` instead. + + xpath : str, optional, default './\*' + The ``XPath`` to parse required set of nodes for migration to + :class:`~pandas.DataFrame`.``XPath`` should return a collection of elements + and not a single element. Note: The ``etree`` parser supports limited ``XPath`` + expressions. For more complex ``XPath``, use ``lxml`` which requires + installation. + + namespaces : dict, optional + The namespaces defined in XML document as dicts with key being + namespace prefix and value the URI. There is no need to include all + namespaces in XML, only the ones used in ``xpath`` expression. + Note: if XML document uses default namespace denoted as + `xmlns=''` without a prefix, you must assign any temporary + namespace prefix such as 'doc' to the URI in order to parse + underlying nodes and/or attributes. For example, :: + + namespaces = {{"doc": "https://example.com"}} + + elems_only : bool, optional, default False + Parse only the child elements at the specified ``xpath``. By default, + all child elements and non-empty text nodes are returned. + + attrs_only : bool, optional, default False + Parse only the attributes at the specified ``xpath``. + By default, all attributes are returned. + + names : list-like, optional + Column names for DataFrame of parsed XML data. Use this parameter to + rename original element names and distinguish same named elements and + attributes. + + dtype : Type name or dict of column -> type, optional + Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32, + 'c': 'Int64'}} + Use `str` or `object` together with suitable `na_values` settings + to preserve and not interpret dtype. + If converters are specified, they will be applied INSTEAD + of dtype conversion. + + .. versionadded:: 1.5.0 + + converters : dict, optional + Dict of functions for converting values in certain columns. Keys can either + be integers or column labels. + + .. versionadded:: 1.5.0 + + parse_dates : bool or list of int or names or list of lists or dict, default False + Identifiers to parse index or columns to datetime. The behavior is as follows: + + * boolean. If True -> try parsing the index. + * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 + each as a separate date column. + * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as + a single date column. + * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call + result 'foo' + + .. versionadded:: 1.5.0 + + encoding : str, optional, default 'utf-8' + Encoding of XML document. + + parser : {{'lxml','etree'}}, default 'lxml' + Parser module to use for retrieval of data. Only 'lxml' and + 'etree' are supported. With 'lxml' more complex ``XPath`` searches + and ability to use XSLT stylesheet are supported. + + stylesheet : str, path object or file-like object + A URL, file-like object, or a raw string containing an XSLT script. + This stylesheet should flatten complex, deeply nested XML documents + for easier parsing. To use this feature you must have ``lxml`` module + installed and specify 'lxml' as ``parser``. The ``xpath`` must + reference nodes of transformed XML document generated after XSLT + transformation and not the original XML document. Only XSLT 1.0 + scripts and not later versions is currently supported. + + iterparse : dict, optional + The nodes or attributes to retrieve in iterparsing of XML document + as a dict with key being the name of repeating element and value being + list of elements or attribute names that are descendants of the repeated + element. Note: If this option is used, it will replace ``xpath`` parsing + and unlike ``xpath``, descendants do not need to relate to each other but can + exist any where in document under the repeating element. This memory- + efficient method should be used for very large XML files (500MB, 1GB, or 5GB+). + For example, :: + + iterparse = {{"row_element": ["child_elem", "attr", "grandchild_elem"]}} + + .. versionadded:: 1.5.0 + + {decompression_options} + + .. versionchanged:: 1.4.0 Zstandard support. + + {storage_options} + + dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). Behaviour is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + (default). + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` + DataFrame. + + .. versionadded:: 2.0 + + Returns + ------- + df + A DataFrame. + + See Also + -------- + read_json : Convert a JSON string to pandas object. + read_html : Read HTML tables into a list of DataFrame objects. + + Notes + ----- + This method is best designed to import shallow XML documents in + following format which is the ideal fit for the two-dimensions of a + ``DataFrame`` (row by column). :: + + + + data + data + data + ... + + + ... + + ... + + + As a file format, XML documents can be designed any way including + layout of elements and attributes as long as it conforms to W3C + specifications. Therefore, this method is a convenience handler for + a specific flatter design and not all possible XML structures. + + However, for more complex XML documents, ``stylesheet`` allows you to + temporarily redesign original document with XSLT (a special purpose + language) for a flatter version for migration to a DataFrame. + + This function will *always* return a single :class:`DataFrame` or raise + exceptions due to issues with XML document, ``xpath``, or other + parameters. + + See the :ref:`read_xml documentation in the IO section of the docs + ` for more information in using this method to parse XML + files to DataFrames. + + Examples + -------- + >>> from io import StringIO + >>> xml = ''' + ... + ... + ... square + ... 360 + ... 4.0 + ... + ... + ... circle + ... 360 + ... + ... + ... + ... triangle + ... 180 + ... 3.0 + ... + ... ''' + + >>> df = pd.read_xml(StringIO(xml)) + >>> df + shape degrees sides + 0 square 360 4.0 + 1 circle 360 NaN + 2 triangle 180 3.0 + + >>> xml = ''' + ... + ... + ... + ... + ... ''' + + >>> df = pd.read_xml(StringIO(xml), xpath=".//row") + >>> df + shape degrees sides + 0 square 360 4.0 + 1 circle 360 NaN + 2 triangle 180 3.0 + + >>> xml = ''' + ... + ... + ... square + ... 360 + ... 4.0 + ... + ... + ... circle + ... 360 + ... + ... + ... + ... triangle + ... 180 + ... 3.0 + ... + ... ''' + + >>> df = pd.read_xml(StringIO(xml), + ... xpath="//doc:row", + ... namespaces={{"doc": "https://example.com"}}) + >>> df + shape degrees sides + 0 square 360 4.0 + 1 circle 360 NaN + 2 triangle 180 3.0 + + >>> xml_data = ''' + ... + ... + ... 0 + ... 1 + ... 2.5 + ... True + ... a + ... 2019-12-31 00:00:00 + ... + ... + ... 1 + ... 4.5 + ... False + ... b + ... 2019-12-31 00:00:00 + ... + ... + ... ''' + + >>> df = pd.read_xml(StringIO(xml_data), + ... dtype_backend="numpy_nullable", + ... parse_dates=["e"]) + >>> df + index a b c d e + 0 0 1 2.5 True a 2019-12-31 + 1 1 4.5 False b 2019-12-31 + """ + check_dtype_backend(dtype_backend) + + return _parse( + path_or_buffer=path_or_buffer, + xpath=xpath, + namespaces=namespaces, + elems_only=elems_only, + attrs_only=attrs_only, + names=names, + dtype=dtype, + converters=converters, + parse_dates=parse_dates, + encoding=encoding, + parser=parser, + stylesheet=stylesheet, + iterparse=iterparse, + compression=compression, + storage_options=storage_options, + dtype_backend=dtype_backend, + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..55c861e384d679654b8615d4cb5808f536fd8f2e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/__init__.py @@ -0,0 +1,98 @@ +""" +Plotting public API. + +Authors of third-party plotting backends should implement a module with a +public ``plot(data, kind, **kwargs)``. The parameter `data` will contain +the data structure and can be a `Series` or a `DataFrame`. For example, +for ``df.plot()`` the parameter `data` will contain the DataFrame `df`. +In some cases, the data structure is transformed before being sent to +the backend (see PlotAccessor.__call__ in pandas/plotting/_core.py for +the exact transformations). + +The parameter `kind` will be one of: + +- line +- bar +- barh +- box +- hist +- kde +- area +- pie +- scatter +- hexbin + +See the pandas API reference for documentation on each kind of plot. + +Any other keyword argument is currently assumed to be backend specific, +but some parameters may be unified and added to the signature in the +future (e.g. `title` which should be useful for any backend). + +Currently, all the Matplotlib functions in pandas are accessed through +the selected backend. For example, `pandas.plotting.boxplot` (equivalent +to `DataFrame.boxplot`) is also accessed in the selected backend. This +is expected to change, and the exact API is under discussion. But with +the current version, backends are expected to implement the next functions: + +- plot (describe above, used for `Series.plot` and `DataFrame.plot`) +- hist_series and hist_frame (for `Series.hist` and `DataFrame.hist`) +- boxplot (`pandas.plotting.boxplot(df)` equivalent to `DataFrame.boxplot`) +- boxplot_frame and boxplot_frame_groupby +- register and deregister (register converters for the tick formats) +- Plots not called as `Series` and `DataFrame` methods: + - table + - andrews_curves + - autocorrelation_plot + - bootstrap_plot + - lag_plot + - parallel_coordinates + - radviz + - scatter_matrix + +Use the code in pandas/plotting/_matplotib.py and +https://github.com/pyviz/hvplot as a reference on how to write a backend. + +For the discussion about the API see +https://github.com/pandas-dev/pandas/issues/26747. +""" +from pandas.plotting._core import ( + PlotAccessor, + boxplot, + boxplot_frame, + boxplot_frame_groupby, + hist_frame, + hist_series, +) +from pandas.plotting._misc import ( + andrews_curves, + autocorrelation_plot, + bootstrap_plot, + deregister as deregister_matplotlib_converters, + lag_plot, + parallel_coordinates, + plot_params, + radviz, + register as register_matplotlib_converters, + scatter_matrix, + table, +) + +__all__ = [ + "PlotAccessor", + "boxplot", + "boxplot_frame", + "boxplot_frame_groupby", + "hist_frame", + "hist_series", + "scatter_matrix", + "radviz", + "andrews_curves", + "bootstrap_plot", + "parallel_coordinates", + "lag_plot", + "autocorrelation_plot", + "table", + "plot_params", + "register_matplotlib_converters", + "deregister_matplotlib_converters", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_core.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_core.py new file mode 100644 index 0000000000000000000000000000000000000000..cb5598a98d5afbc93954d74e3ecc78b4e572606d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_core.py @@ -0,0 +1,1946 @@ +from __future__ import annotations + +import importlib +from typing import ( + TYPE_CHECKING, + Callable, + Literal, +) + +from pandas._config import get_option + +from pandas.util._decorators import ( + Appender, + Substitution, +) + +from pandas.core.dtypes.common import ( + is_integer, + is_list_like, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, +) + +from pandas.core.base import PandasObject + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Sequence, + ) + import types + + from matplotlib.axes import Axes + import numpy as np + + from pandas._typing import IndexLabel + + from pandas import ( + DataFrame, + Series, + ) + from pandas.core.groupby.generic import DataFrameGroupBy + + +def hist_series( + self: Series, + by=None, + ax=None, + grid: bool = True, + xlabelsize: int | None = None, + xrot: float | None = None, + ylabelsize: int | None = None, + yrot: float | None = None, + figsize: tuple[int, int] | None = None, + bins: int | Sequence[int] = 10, + backend: str | None = None, + legend: bool = False, + **kwargs, +): + """ + Draw histogram of the input series using matplotlib. + + Parameters + ---------- + by : object, optional + If passed, then used to form histograms for separate groups. + ax : matplotlib axis object + If not passed, uses gca(). + grid : bool, default True + Whether to show axis grid lines. + xlabelsize : int, default None + If specified changes the x-axis label size. + xrot : float, default None + Rotation of x axis labels. + ylabelsize : int, default None + If specified changes the y-axis label size. + yrot : float, default None + Rotation of y axis labels. + figsize : tuple, default None + Figure size in inches by default. + bins : int or sequence, default 10 + Number of histogram bins to be used. If an integer is given, bins + 1 + bin edges are calculated and returned. If bins is a sequence, gives + bin edges, including left edge of first bin and right edge of last + bin. In this case, bins is returned unmodified. + backend : str, default None + Backend to use instead of the backend specified in the option + ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to + specify the ``plotting.backend`` for the whole session, set + ``pd.options.plotting.backend``. + legend : bool, default False + Whether to show the legend. + + **kwargs + To be passed to the actual plotting function. + + Returns + ------- + matplotlib.AxesSubplot + A histogram plot. + + See Also + -------- + matplotlib.axes.Axes.hist : Plot a histogram using matplotlib. + + Examples + -------- + For Series: + + .. plot:: + :context: close-figs + + >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> ser = pd.Series([1, 2, 2, 4, 6, 6], index=lst) + >>> hist = ser.hist() + + For Groupby: + + .. plot:: + :context: close-figs + + >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> ser = pd.Series([1, 2, 2, 4, 6, 6], index=lst) + >>> hist = ser.groupby(level=0).hist() + """ + plot_backend = _get_plot_backend(backend) + return plot_backend.hist_series( + self, + by=by, + ax=ax, + grid=grid, + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, + figsize=figsize, + bins=bins, + legend=legend, + **kwargs, + ) + + +def hist_frame( + data: DataFrame, + column: IndexLabel | None = None, + by=None, + grid: bool = True, + xlabelsize: int | None = None, + xrot: float | None = None, + ylabelsize: int | None = None, + yrot: float | None = None, + ax=None, + sharex: bool = False, + sharey: bool = False, + figsize: tuple[int, int] | None = None, + layout: tuple[int, int] | None = None, + bins: int | Sequence[int] = 10, + backend: str | None = None, + legend: bool = False, + **kwargs, +): + """ + Make a histogram of the DataFrame's columns. + + A `histogram`_ is a representation of the distribution of data. + This function calls :meth:`matplotlib.pyplot.hist`, on each series in + the DataFrame, resulting in one histogram per column. + + .. _histogram: https://en.wikipedia.org/wiki/Histogram + + Parameters + ---------- + data : DataFrame + The pandas object holding the data. + column : str or sequence, optional + If passed, will be used to limit data to a subset of columns. + by : object, optional + If passed, then used to form histograms for separate groups. + grid : bool, default True + Whether to show axis grid lines. + xlabelsize : int, default None + If specified changes the x-axis label size. + xrot : float, default None + Rotation of x axis labels. For example, a value of 90 displays the + x labels rotated 90 degrees clockwise. + ylabelsize : int, default None + If specified changes the y-axis label size. + yrot : float, default None + Rotation of y axis labels. For example, a value of 90 displays the + y labels rotated 90 degrees clockwise. + ax : Matplotlib axes object, default None + The axes to plot the histogram on. + sharex : bool, default True if ax is None else False + In case subplots=True, share x axis and set some x axis labels to + invisible; defaults to True if ax is None otherwise False if an ax + is passed in. + Note that passing in both an ax and sharex=True will alter all x axis + labels for all subplots in a figure. + sharey : bool, default False + In case subplots=True, share y axis and set some y axis labels to + invisible. + figsize : tuple, optional + The size in inches of the figure to create. Uses the value in + `matplotlib.rcParams` by default. + layout : tuple, optional + Tuple of (rows, columns) for the layout of the histograms. + bins : int or sequence, default 10 + Number of histogram bins to be used. If an integer is given, bins + 1 + bin edges are calculated and returned. If bins is a sequence, gives + bin edges, including left edge of first bin and right edge of last + bin. In this case, bins is returned unmodified. + + backend : str, default None + Backend to use instead of the backend specified in the option + ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to + specify the ``plotting.backend`` for the whole session, set + ``pd.options.plotting.backend``. + + legend : bool, default False + Whether to show the legend. + + **kwargs + All other plotting keyword arguments to be passed to + :meth:`matplotlib.pyplot.hist`. + + Returns + ------- + matplotlib.AxesSubplot or numpy.ndarray of them + + See Also + -------- + matplotlib.pyplot.hist : Plot a histogram using matplotlib. + + Examples + -------- + This example draws a histogram based on the length and width of + some animals, displayed in three bins + + .. plot:: + :context: close-figs + + >>> data = {'length': [1.5, 0.5, 1.2, 0.9, 3], + ... 'width': [0.7, 0.2, 0.15, 0.2, 1.1]} + >>> index = ['pig', 'rabbit', 'duck', 'chicken', 'horse'] + >>> df = pd.DataFrame(data, index=index) + >>> hist = df.hist(bins=3) + """ + plot_backend = _get_plot_backend(backend) + return plot_backend.hist_frame( + data, + column=column, + by=by, + grid=grid, + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, + ax=ax, + sharex=sharex, + sharey=sharey, + figsize=figsize, + layout=layout, + legend=legend, + bins=bins, + **kwargs, + ) + + +_boxplot_doc = """ +Make a box plot from DataFrame columns. + +Make a box-and-whisker plot from DataFrame columns, optionally grouped +by some other columns. A box plot is a method for graphically depicting +groups of numerical data through their quartiles. +The box extends from the Q1 to Q3 quartile values of the data, +with a line at the median (Q2). The whiskers extend from the edges +of box to show the range of the data. By default, they extend no more than +`1.5 * IQR (IQR = Q3 - Q1)` from the edges of the box, ending at the farthest +data point within that interval. Outliers are plotted as separate dots. + +For further details see +Wikipedia's entry for `boxplot `_. + +Parameters +---------- +%(data)s\ +column : str or list of str, optional + Column name or list of names, or vector. + Can be any valid input to :meth:`pandas.DataFrame.groupby`. +by : str or array-like, optional + Column in the DataFrame to :meth:`pandas.DataFrame.groupby`. + One box-plot will be done per value of columns in `by`. +ax : object of class matplotlib.axes.Axes, optional + The matplotlib axes to be used by boxplot. +fontsize : float or str + Tick label font size in points or as a string (e.g., `large`). +rot : float, default 0 + The rotation angle of labels (in degrees) + with respect to the screen coordinate system. +grid : bool, default True + Setting this to True will show the grid. +figsize : A tuple (width, height) in inches + The size of the figure to create in matplotlib. +layout : tuple (rows, columns), optional + For example, (3, 5) will display the subplots + using 3 rows and 5 columns, starting from the top-left. +return_type : {'axes', 'dict', 'both'} or None, default 'axes' + The kind of object to return. The default is ``axes``. + + * 'axes' returns the matplotlib axes the boxplot is drawn on. + * 'dict' returns a dictionary whose values are the matplotlib + Lines of the boxplot. + * 'both' returns a namedtuple with the axes and dict. + * when grouping with ``by``, a Series mapping columns to + ``return_type`` is returned. + + If ``return_type`` is `None`, a NumPy array + of axes with the same shape as ``layout`` is returned. +%(backend)s\ + +**kwargs + All other plotting keyword arguments to be passed to + :func:`matplotlib.pyplot.boxplot`. + +Returns +------- +result + See Notes. + +See Also +-------- +pandas.Series.plot.hist: Make a histogram. +matplotlib.pyplot.boxplot : Matplotlib equivalent plot. + +Notes +----- +The return type depends on the `return_type` parameter: + +* 'axes' : object of class matplotlib.axes.Axes +* 'dict' : dict of matplotlib.lines.Line2D objects +* 'both' : a namedtuple with structure (ax, lines) + +For data grouped with ``by``, return a Series of the above or a numpy +array: + +* :class:`~pandas.Series` +* :class:`~numpy.array` (for ``return_type = None``) + +Use ``return_type='dict'`` when you want to tweak the appearance +of the lines after plotting. In this case a dict containing the Lines +making up the boxes, caps, fliers, medians, and whiskers is returned. + +Examples +-------- + +Boxplots can be created for every column in the dataframe +by ``df.boxplot()`` or indicating the columns to be used: + +.. plot:: + :context: close-figs + + >>> np.random.seed(1234) + >>> df = pd.DataFrame(np.random.randn(10, 4), + ... columns=['Col1', 'Col2', 'Col3', 'Col4']) + >>> boxplot = df.boxplot(column=['Col1', 'Col2', 'Col3']) # doctest: +SKIP + +Boxplots of variables distributions grouped by the values of a third +variable can be created using the option ``by``. For instance: + +.. plot:: + :context: close-figs + + >>> df = pd.DataFrame(np.random.randn(10, 2), + ... columns=['Col1', 'Col2']) + >>> df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A', + ... 'B', 'B', 'B', 'B', 'B']) + >>> boxplot = df.boxplot(by='X') + +A list of strings (i.e. ``['X', 'Y']``) can be passed to boxplot +in order to group the data by combination of the variables in the x-axis: + +.. plot:: + :context: close-figs + + >>> df = pd.DataFrame(np.random.randn(10, 3), + ... columns=['Col1', 'Col2', 'Col3']) + >>> df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A', + ... 'B', 'B', 'B', 'B', 'B']) + >>> df['Y'] = pd.Series(['A', 'B', 'A', 'B', 'A', + ... 'B', 'A', 'B', 'A', 'B']) + >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by=['X', 'Y']) + +The layout of boxplot can be adjusted giving a tuple to ``layout``: + +.. plot:: + :context: close-figs + + >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X', + ... layout=(2, 1)) + +Additional formatting can be done to the boxplot, like suppressing the grid +(``grid=False``), rotating the labels in the x-axis (i.e. ``rot=45``) +or changing the fontsize (i.e. ``fontsize=15``): + +.. plot:: + :context: close-figs + + >>> boxplot = df.boxplot(grid=False, rot=45, fontsize=15) # doctest: +SKIP + +The parameter ``return_type`` can be used to select the type of element +returned by `boxplot`. When ``return_type='axes'`` is selected, +the matplotlib axes on which the boxplot is drawn are returned: + + >>> boxplot = df.boxplot(column=['Col1', 'Col2'], return_type='axes') + >>> type(boxplot) + + +When grouping with ``by``, a Series mapping columns to ``return_type`` +is returned: + + >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X', + ... return_type='axes') + >>> type(boxplot) + + +If ``return_type`` is `None`, a NumPy array of axes with the same shape +as ``layout`` is returned: + + >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X', + ... return_type=None) + >>> type(boxplot) + +""" + +_backend_doc = """\ +backend : str, default None + Backend to use instead of the backend specified in the option + ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to + specify the ``plotting.backend`` for the whole session, set + ``pd.options.plotting.backend``. +""" + + +_bar_or_line_doc = """ + Parameters + ---------- + x : label or position, optional + Allows plotting of one column versus another. If not specified, + the index of the DataFrame is used. + y : label or position, optional + Allows plotting of one column versus another. If not specified, + all numerical columns are used. + color : str, array-like, or dict, optional + The color for each of the DataFrame's columns. Possible values are: + + - A single color string referred to by name, RGB or RGBA code, + for instance 'red' or '#a98d19'. + + - A sequence of color strings referred to by name, RGB or RGBA + code, which will be used for each column recursively. For + instance ['green','yellow'] each column's %(kind)s will be filled in + green or yellow, alternatively. If there is only a single column to + be plotted, then only the first color from the color list will be + used. + + - A dict of the form {column name : color}, so that each column will be + colored accordingly. For example, if your columns are called `a` and + `b`, then passing {'a': 'green', 'b': 'red'} will color %(kind)ss for + column `a` in green and %(kind)ss for column `b` in red. + + **kwargs + Additional keyword arguments are documented in + :meth:`DataFrame.plot`. + + Returns + ------- + matplotlib.axes.Axes or np.ndarray of them + An ndarray is returned with one :class:`matplotlib.axes.Axes` + per column when ``subplots=True``. +""" + + +@Substitution(data="data : DataFrame\n The data to visualize.\n", backend="") +@Appender(_boxplot_doc) +def boxplot( + data: DataFrame, + column: str | list[str] | None = None, + by: str | list[str] | None = None, + ax: Axes | None = None, + fontsize: float | str | None = None, + rot: int = 0, + grid: bool = True, + figsize: tuple[float, float] | None = None, + layout: tuple[int, int] | None = None, + return_type: str | None = None, + **kwargs, +): + plot_backend = _get_plot_backend("matplotlib") + return plot_backend.boxplot( + data, + column=column, + by=by, + ax=ax, + fontsize=fontsize, + rot=rot, + grid=grid, + figsize=figsize, + layout=layout, + return_type=return_type, + **kwargs, + ) + + +@Substitution(data="", backend=_backend_doc) +@Appender(_boxplot_doc) +def boxplot_frame( + self: DataFrame, + column=None, + by=None, + ax=None, + fontsize: int | None = None, + rot: int = 0, + grid: bool = True, + figsize: tuple[float, float] | None = None, + layout=None, + return_type=None, + backend=None, + **kwargs, +): + plot_backend = _get_plot_backend(backend) + return plot_backend.boxplot_frame( + self, + column=column, + by=by, + ax=ax, + fontsize=fontsize, + rot=rot, + grid=grid, + figsize=figsize, + layout=layout, + return_type=return_type, + **kwargs, + ) + + +def boxplot_frame_groupby( + grouped: DataFrameGroupBy, + subplots: bool = True, + column=None, + fontsize: int | None = None, + rot: int = 0, + grid: bool = True, + ax=None, + figsize: tuple[float, float] | None = None, + layout=None, + sharex: bool = False, + sharey: bool = True, + backend=None, + **kwargs, +): + """ + Make box plots from DataFrameGroupBy data. + + Parameters + ---------- + grouped : Grouped DataFrame + subplots : bool + * ``False`` - no subplots will be used + * ``True`` - create a subplot for each group. + + column : column name or list of names, or vector + Can be any valid input to groupby. + fontsize : float or str + rot : label rotation angle + grid : Setting this to True will show the grid + ax : Matplotlib axis object, default None + figsize : A tuple (width, height) in inches + layout : tuple (optional) + The layout of the plot: (rows, columns). + sharex : bool, default False + Whether x-axes will be shared among subplots. + sharey : bool, default True + Whether y-axes will be shared among subplots. + backend : str, default None + Backend to use instead of the backend specified in the option + ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to + specify the ``plotting.backend`` for the whole session, set + ``pd.options.plotting.backend``. + **kwargs + All other plotting keyword arguments to be passed to + matplotlib's boxplot function. + + Returns + ------- + dict of key/value = group key/DataFrame.boxplot return value + or DataFrame.boxplot return value in case subplots=figures=False + + Examples + -------- + You can create boxplots for grouped data and show them as separate subplots: + + .. plot:: + :context: close-figs + + >>> import itertools + >>> tuples = [t for t in itertools.product(range(1000), range(4))] + >>> index = pd.MultiIndex.from_tuples(tuples, names=['lvl0', 'lvl1']) + >>> data = np.random.randn(len(index), 4) + >>> df = pd.DataFrame(data, columns=list('ABCD'), index=index) + >>> grouped = df.groupby(level='lvl1') + >>> grouped.boxplot(rot=45, fontsize=12, figsize=(8, 10)) # doctest: +SKIP + + The ``subplots=False`` option shows the boxplots in a single figure. + + .. plot:: + :context: close-figs + + >>> grouped.boxplot(subplots=False, rot=45, fontsize=12) # doctest: +SKIP + """ + plot_backend = _get_plot_backend(backend) + return plot_backend.boxplot_frame_groupby( + grouped, + subplots=subplots, + column=column, + fontsize=fontsize, + rot=rot, + grid=grid, + ax=ax, + figsize=figsize, + layout=layout, + sharex=sharex, + sharey=sharey, + **kwargs, + ) + + +class PlotAccessor(PandasObject): + """ + Make plots of Series or DataFrame. + + Uses the backend specified by the + option ``plotting.backend``. By default, matplotlib is used. + + Parameters + ---------- + data : Series or DataFrame + The object for which the method is called. + x : label or position, default None + Only used if data is a DataFrame. + y : label, position or list of label, positions, default None + Allows plotting of one column versus another. Only used if data is a + DataFrame. + kind : str + The kind of plot to produce: + + - 'line' : line plot (default) + - 'bar' : vertical bar plot + - 'barh' : horizontal bar plot + - 'hist' : histogram + - 'box' : boxplot + - 'kde' : Kernel Density Estimation plot + - 'density' : same as 'kde' + - 'area' : area plot + - 'pie' : pie plot + - 'scatter' : scatter plot (DataFrame only) + - 'hexbin' : hexbin plot (DataFrame only) + ax : matplotlib axes object, default None + An axes of the current figure. + subplots : bool or sequence of iterables, default False + Whether to group columns into subplots: + + - ``False`` : No subplots will be used + - ``True`` : Make separate subplots for each column. + - sequence of iterables of column labels: Create a subplot for each + group of columns. For example `[('a', 'c'), ('b', 'd')]` will + create 2 subplots: one with columns 'a' and 'c', and one + with columns 'b' and 'd'. Remaining columns that aren't specified + will be plotted in additional subplots (one per column). + + .. versionadded:: 1.5.0 + + sharex : bool, default True if ax is None else False + In case ``subplots=True``, share x axis and set some x axis labels + to invisible; defaults to True if ax is None otherwise False if + an ax is passed in; Be aware, that passing in both an ax and + ``sharex=True`` will alter all x axis labels for all axis in a figure. + sharey : bool, default False + In case ``subplots=True``, share y axis and set some y axis labels to invisible. + layout : tuple, optional + (rows, columns) for the layout of subplots. + figsize : a tuple (width, height) in inches + Size of a figure object. + use_index : bool, default True + Use index as ticks for x axis. + title : str or list + Title to use for the plot. If a string is passed, print the string + at the top of the figure. If a list is passed and `subplots` is + True, print each item in the list above the corresponding subplot. + grid : bool, default None (matlab style default) + Axis grid lines. + legend : bool or {'reverse'} + Place legend on axis subplots. + style : list or dict + The matplotlib line style per column. + logx : bool or 'sym', default False + Use log scaling or symlog scaling on x axis. + + logy : bool or 'sym' default False + Use log scaling or symlog scaling on y axis. + + loglog : bool or 'sym', default False + Use log scaling or symlog scaling on both x and y axes. + + xticks : sequence + Values to use for the xticks. + yticks : sequence + Values to use for the yticks. + xlim : 2-tuple/list + Set the x limits of the current axes. + ylim : 2-tuple/list + Set the y limits of the current axes. + xlabel : label, optional + Name to use for the xlabel on x-axis. Default uses index name as xlabel, or the + x-column name for planar plots. + + .. versionchanged:: 2.0.0 + + Now applicable to histograms. + + ylabel : label, optional + Name to use for the ylabel on y-axis. Default will show no ylabel, or the + y-column name for planar plots. + + .. versionchanged:: 2.0.0 + + Now applicable to histograms. + + rot : float, default None + Rotation for ticks (xticks for vertical, yticks for horizontal + plots). + fontsize : float, default None + Font size for xticks and yticks. + colormap : str or matplotlib colormap object, default None + Colormap to select colors from. If string, load colormap with that + name from matplotlib. + colorbar : bool, optional + If True, plot colorbar (only relevant for 'scatter' and 'hexbin' + plots). + position : float + Specify relative alignments for bar plot layout. + From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 + (center). + table : bool, Series or DataFrame, default False + If True, draw a table using the data in the DataFrame and the data + will be transposed to meet matplotlib's default layout. + If a Series or DataFrame is passed, use passed data to draw a + table. + yerr : DataFrame, Series, array-like, dict and str + See :ref:`Plotting with Error Bars ` for + detail. + xerr : DataFrame, Series, array-like, dict and str + Equivalent to yerr. + stacked : bool, default False in line and bar plots, and True in area plot + If True, create stacked plot. + secondary_y : bool or sequence, default False + Whether to plot on the secondary y-axis if a list/tuple, which + columns to plot on secondary y-axis. + mark_right : bool, default True + When using a secondary_y axis, automatically mark the column + labels with "(right)" in the legend. + include_bool : bool, default is False + If True, boolean values can be plotted. + backend : str, default None + Backend to use instead of the backend specified in the option + ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to + specify the ``plotting.backend`` for the whole session, set + ``pd.options.plotting.backend``. + **kwargs + Options to pass to matplotlib plotting method. + + Returns + ------- + :class:`matplotlib.axes.Axes` or numpy.ndarray of them + If the backend is not the default matplotlib one, the return value + will be the object returned by the backend. + + Notes + ----- + - See matplotlib documentation online for more on this subject + - If `kind` = 'bar' or 'barh', you can specify relative alignments + for bar plot layout by `position` keyword. + From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 + (center) + + Examples + -------- + For Series: + + .. plot:: + :context: close-figs + + >>> ser = pd.Series([1, 2, 3, 3]) + >>> plot = ser.plot(kind='hist', title="My plot") + + For DataFrame: + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame({'length': [1.5, 0.5, 1.2, 0.9, 3], + ... 'width': [0.7, 0.2, 0.15, 0.2, 1.1]}, + ... index=['pig', 'rabbit', 'duck', 'chicken', 'horse']) + >>> plot = df.plot(title="DataFrame Plot") + + For SeriesGroupBy: + + .. plot:: + :context: close-figs + + >>> lst = [-1, -2, -3, 1, 2, 3] + >>> ser = pd.Series([1, 2, 2, 4, 6, 6], index=lst) + >>> plot = ser.groupby(lambda x: x > 0).plot(title="SeriesGroupBy Plot") + + For DataFrameGroupBy: + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame({"col1" : [1, 2, 3, 4], + ... "col2" : ["A", "B", "A", "B"]}) + >>> plot = df.groupby("col2").plot(kind="bar", title="DataFrameGroupBy Plot") + """ + + _common_kinds = ("line", "bar", "barh", "kde", "density", "area", "hist", "box") + _series_kinds = ("pie",) + _dataframe_kinds = ("scatter", "hexbin") + _kind_aliases = {"density": "kde"} + _all_kinds = _common_kinds + _series_kinds + _dataframe_kinds + + def __init__(self, data: Series | DataFrame) -> None: + self._parent = data + + @staticmethod + def _get_call_args(backend_name: str, data: Series | DataFrame, args, kwargs): + """ + This function makes calls to this accessor `__call__` method compatible + with the previous `SeriesPlotMethods.__call__` and + `DataFramePlotMethods.__call__`. Those had slightly different + signatures, since `DataFramePlotMethods` accepted `x` and `y` + parameters. + """ + if isinstance(data, ABCSeries): + arg_def = [ + ("kind", "line"), + ("ax", None), + ("figsize", None), + ("use_index", True), + ("title", None), + ("grid", None), + ("legend", False), + ("style", None), + ("logx", False), + ("logy", False), + ("loglog", False), + ("xticks", None), + ("yticks", None), + ("xlim", None), + ("ylim", None), + ("rot", None), + ("fontsize", None), + ("colormap", None), + ("table", False), + ("yerr", None), + ("xerr", None), + ("label", None), + ("secondary_y", False), + ("xlabel", None), + ("ylabel", None), + ] + elif isinstance(data, ABCDataFrame): + arg_def = [ + ("x", None), + ("y", None), + ("kind", "line"), + ("ax", None), + ("subplots", False), + ("sharex", None), + ("sharey", False), + ("layout", None), + ("figsize", None), + ("use_index", True), + ("title", None), + ("grid", None), + ("legend", True), + ("style", None), + ("logx", False), + ("logy", False), + ("loglog", False), + ("xticks", None), + ("yticks", None), + ("xlim", None), + ("ylim", None), + ("rot", None), + ("fontsize", None), + ("colormap", None), + ("table", False), + ("yerr", None), + ("xerr", None), + ("secondary_y", False), + ("xlabel", None), + ("ylabel", None), + ] + else: + raise TypeError( + f"Called plot accessor for type {type(data).__name__}, " + "expected Series or DataFrame" + ) + + if args and isinstance(data, ABCSeries): + positional_args = str(args)[1:-1] + keyword_args = ", ".join( + [f"{name}={repr(value)}" for (name, _), value in zip(arg_def, args)] + ) + msg = ( + "`Series.plot()` should not be called with positional " + "arguments, only keyword arguments. The order of " + "positional arguments will change in the future. " + f"Use `Series.plot({keyword_args})` instead of " + f"`Series.plot({positional_args})`." + ) + raise TypeError(msg) + + pos_args = {name: value for (name, _), value in zip(arg_def, args)} + if backend_name == "pandas.plotting._matplotlib": + kwargs = dict(arg_def, **pos_args, **kwargs) + else: + kwargs = dict(pos_args, **kwargs) + + x = kwargs.pop("x", None) + y = kwargs.pop("y", None) + kind = kwargs.pop("kind", "line") + return x, y, kind, kwargs + + def __call__(self, *args, **kwargs): + plot_backend = _get_plot_backend(kwargs.pop("backend", None)) + + x, y, kind, kwargs = self._get_call_args( + plot_backend.__name__, self._parent, args, kwargs + ) + + kind = self._kind_aliases.get(kind, kind) + + # when using another backend, get out of the way + if plot_backend.__name__ != "pandas.plotting._matplotlib": + return plot_backend.plot(self._parent, x=x, y=y, kind=kind, **kwargs) + + if kind not in self._all_kinds: + raise ValueError( + f"{kind} is not a valid plot kind " + f"Valid plot kinds: {self._all_kinds}" + ) + + # The original data structured can be transformed before passed to the + # backend. For example, for DataFrame is common to set the index as the + # `x` parameter, and return a Series with the parameter `y` as values. + data = self._parent.copy() + + if isinstance(data, ABCSeries): + kwargs["reuse_plot"] = True + + if kind in self._dataframe_kinds: + if isinstance(data, ABCDataFrame): + return plot_backend.plot(data, x=x, y=y, kind=kind, **kwargs) + else: + raise ValueError(f"plot kind {kind} can only be used for data frames") + elif kind in self._series_kinds: + if isinstance(data, ABCDataFrame): + if y is None and kwargs.get("subplots") is False: + raise ValueError( + f"{kind} requires either y column or 'subplots=True'" + ) + if y is not None: + if is_integer(y) and not data.columns._holds_integer(): + y = data.columns[y] + # converted to series actually. copy to not modify + data = data[y].copy() + data.index.name = y + elif isinstance(data, ABCDataFrame): + data_cols = data.columns + if x is not None: + if is_integer(x) and not data.columns._holds_integer(): + x = data_cols[x] + elif not isinstance(data[x], ABCSeries): + raise ValueError("x must be a label or position") + data = data.set_index(x) + if y is not None: + # check if we have y as int or list of ints + int_ylist = is_list_like(y) and all(is_integer(c) for c in y) + int_y_arg = is_integer(y) or int_ylist + if int_y_arg and not data.columns._holds_integer(): + y = data_cols[y] + + label_kw = kwargs["label"] if "label" in kwargs else False + for kw in ["xerr", "yerr"]: + if kw in kwargs and ( + isinstance(kwargs[kw], str) or is_integer(kwargs[kw]) + ): + try: + kwargs[kw] = data[kwargs[kw]] + except (IndexError, KeyError, TypeError): + pass + + # don't overwrite + data = data[y].copy() + + if isinstance(data, ABCSeries): + label_name = label_kw or y + data.name = label_name + else: + match = is_list_like(label_kw) and len(label_kw) == len(y) + if label_kw and not match: + raise ValueError( + "label should be list-like and same length as y" + ) + label_name = label_kw or data.columns + data.columns = label_name + + return plot_backend.plot(data, kind=kind, **kwargs) + + __call__.__doc__ = __doc__ + + @Appender( + """ + See Also + -------- + matplotlib.pyplot.plot : Plot y versus x as lines and/or markers. + + Examples + -------- + + .. plot:: + :context: close-figs + + >>> s = pd.Series([1, 3, 2]) + >>> s.plot.line() # doctest: +SKIP + + .. plot:: + :context: close-figs + + The following example shows the populations for some animals + over the years. + + >>> df = pd.DataFrame({ + ... 'pig': [20, 18, 489, 675, 1776], + ... 'horse': [4, 25, 281, 600, 1900] + ... }, index=[1990, 1997, 2003, 2009, 2014]) + >>> lines = df.plot.line() + + .. plot:: + :context: close-figs + + An example with subplots, so an array of axes is returned. + + >>> axes = df.plot.line(subplots=True) + >>> type(axes) + + + .. plot:: + :context: close-figs + + Let's repeat the same example, but specifying colors for + each column (in this case, for each animal). + + >>> axes = df.plot.line( + ... subplots=True, color={"pig": "pink", "horse": "#742802"} + ... ) + + .. plot:: + :context: close-figs + + The following example shows the relationship between both + populations. + + >>> lines = df.plot.line(x='pig', y='horse') + """ + ) + @Substitution(kind="line") + @Appender(_bar_or_line_doc) + def line( + self, x: Hashable | None = None, y: Hashable | None = None, **kwargs + ) -> PlotAccessor: + """ + Plot Series or DataFrame as lines. + + This function is useful to plot lines using DataFrame's values + as coordinates. + """ + return self(kind="line", x=x, y=y, **kwargs) + + @Appender( + """ + See Also + -------- + DataFrame.plot.barh : Horizontal bar plot. + DataFrame.plot : Make plots of a DataFrame. + matplotlib.pyplot.bar : Make a bar plot with matplotlib. + + Examples + -------- + Basic plot. + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame({'lab':['A', 'B', 'C'], 'val':[10, 30, 20]}) + >>> ax = df.plot.bar(x='lab', y='val', rot=0) + + Plot a whole dataframe to a bar plot. Each column is assigned a + distinct color, and each row is nested in a group along the + horizontal axis. + + .. plot:: + :context: close-figs + + >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88] + >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28] + >>> index = ['snail', 'pig', 'elephant', + ... 'rabbit', 'giraffe', 'coyote', 'horse'] + >>> df = pd.DataFrame({'speed': speed, + ... 'lifespan': lifespan}, index=index) + >>> ax = df.plot.bar(rot=0) + + Plot stacked bar charts for the DataFrame + + .. plot:: + :context: close-figs + + >>> ax = df.plot.bar(stacked=True) + + Instead of nesting, the figure can be split by column with + ``subplots=True``. In this case, a :class:`numpy.ndarray` of + :class:`matplotlib.axes.Axes` are returned. + + .. plot:: + :context: close-figs + + >>> axes = df.plot.bar(rot=0, subplots=True) + >>> axes[1].legend(loc=2) # doctest: +SKIP + + If you don't like the default colours, you can specify how you'd + like each column to be colored. + + .. plot:: + :context: close-figs + + >>> axes = df.plot.bar( + ... rot=0, subplots=True, color={"speed": "red", "lifespan": "green"} + ... ) + >>> axes[1].legend(loc=2) # doctest: +SKIP + + Plot a single column. + + .. plot:: + :context: close-figs + + >>> ax = df.plot.bar(y='speed', rot=0) + + Plot only selected categories for the DataFrame. + + .. plot:: + :context: close-figs + + >>> ax = df.plot.bar(x='lifespan', rot=0) + """ + ) + @Substitution(kind="bar") + @Appender(_bar_or_line_doc) + def bar( # pylint: disable=disallowed-name + self, x: Hashable | None = None, y: Hashable | None = None, **kwargs + ) -> PlotAccessor: + """ + Vertical bar plot. + + A bar plot is a plot that presents categorical data with + rectangular bars with lengths proportional to the values that they + represent. A bar plot shows comparisons among discrete categories. One + axis of the plot shows the specific categories being compared, and the + other axis represents a measured value. + """ + return self(kind="bar", x=x, y=y, **kwargs) + + @Appender( + """ + See Also + -------- + DataFrame.plot.bar: Vertical bar plot. + DataFrame.plot : Make plots of DataFrame using matplotlib. + matplotlib.axes.Axes.bar : Plot a vertical bar plot using matplotlib. + + Examples + -------- + Basic example + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame({'lab': ['A', 'B', 'C'], 'val': [10, 30, 20]}) + >>> ax = df.plot.barh(x='lab', y='val') + + Plot a whole DataFrame to a horizontal bar plot + + .. plot:: + :context: close-figs + + >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88] + >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28] + >>> index = ['snail', 'pig', 'elephant', + ... 'rabbit', 'giraffe', 'coyote', 'horse'] + >>> df = pd.DataFrame({'speed': speed, + ... 'lifespan': lifespan}, index=index) + >>> ax = df.plot.barh() + + Plot stacked barh charts for the DataFrame + + .. plot:: + :context: close-figs + + >>> ax = df.plot.barh(stacked=True) + + We can specify colors for each column + + .. plot:: + :context: close-figs + + >>> ax = df.plot.barh(color={"speed": "red", "lifespan": "green"}) + + Plot a column of the DataFrame to a horizontal bar plot + + .. plot:: + :context: close-figs + + >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88] + >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28] + >>> index = ['snail', 'pig', 'elephant', + ... 'rabbit', 'giraffe', 'coyote', 'horse'] + >>> df = pd.DataFrame({'speed': speed, + ... 'lifespan': lifespan}, index=index) + >>> ax = df.plot.barh(y='speed') + + Plot DataFrame versus the desired column + + .. plot:: + :context: close-figs + + >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88] + >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28] + >>> index = ['snail', 'pig', 'elephant', + ... 'rabbit', 'giraffe', 'coyote', 'horse'] + >>> df = pd.DataFrame({'speed': speed, + ... 'lifespan': lifespan}, index=index) + >>> ax = df.plot.barh(x='lifespan') + """ + ) + @Substitution(kind="bar") + @Appender(_bar_or_line_doc) + def barh( + self, x: Hashable | None = None, y: Hashable | None = None, **kwargs + ) -> PlotAccessor: + """ + Make a horizontal bar plot. + + A horizontal bar plot is a plot that presents quantitative data with + rectangular bars with lengths proportional to the values that they + represent. A bar plot shows comparisons among discrete categories. One + axis of the plot shows the specific categories being compared, and the + other axis represents a measured value. + """ + return self(kind="barh", x=x, y=y, **kwargs) + + def box(self, by: IndexLabel | None = None, **kwargs) -> PlotAccessor: + r""" + Make a box plot of the DataFrame columns. + + A box plot is a method for graphically depicting groups of numerical + data through their quartiles. + The box extends from the Q1 to Q3 quartile values of the data, + with a line at the median (Q2). The whiskers extend from the edges + of box to show the range of the data. The position of the whiskers + is set by default to 1.5*IQR (IQR = Q3 - Q1) from the edges of the + box. Outlier points are those past the end of the whiskers. + + For further details see Wikipedia's + entry for `boxplot `__. + + A consideration when using this chart is that the box and the whiskers + can overlap, which is very common when plotting small sets of data. + + Parameters + ---------- + by : str or sequence + Column in the DataFrame to group by. + + .. versionchanged:: 1.4.0 + + Previously, `by` is silently ignore and makes no groupings + + **kwargs + Additional keywords are documented in + :meth:`DataFrame.plot`. + + Returns + ------- + :class:`matplotlib.axes.Axes` or numpy.ndarray of them + + See Also + -------- + DataFrame.boxplot: Another method to draw a box plot. + Series.plot.box: Draw a box plot from a Series object. + matplotlib.pyplot.boxplot: Draw a box plot in matplotlib. + + Examples + -------- + Draw a box plot from a DataFrame with four columns of randomly + generated data. + + .. plot:: + :context: close-figs + + >>> data = np.random.randn(25, 4) + >>> df = pd.DataFrame(data, columns=list('ABCD')) + >>> ax = df.plot.box() + + You can also generate groupings if you specify the `by` parameter (which + can take a column name, or a list or tuple of column names): + + .. versionchanged:: 1.4.0 + + .. plot:: + :context: close-figs + + >>> age_list = [8, 10, 12, 14, 72, 74, 76, 78, 20, 25, 30, 35, 60, 85] + >>> df = pd.DataFrame({"gender": list("MMMMMMMMFFFFFF"), "age": age_list}) + >>> ax = df.plot.box(column="age", by="gender", figsize=(10, 8)) + """ + return self(kind="box", by=by, **kwargs) + + def hist( + self, by: IndexLabel | None = None, bins: int = 10, **kwargs + ) -> PlotAccessor: + """ + Draw one histogram of the DataFrame's columns. + + A histogram is a representation of the distribution of data. + This function groups the values of all given Series in the DataFrame + into bins and draws all bins in one :class:`matplotlib.axes.Axes`. + This is useful when the DataFrame's Series are in a similar scale. + + Parameters + ---------- + by : str or sequence, optional + Column in the DataFrame to group by. + + .. versionchanged:: 1.4.0 + + Previously, `by` is silently ignore and makes no groupings + + bins : int, default 10 + Number of histogram bins to be used. + **kwargs + Additional keyword arguments are documented in + :meth:`DataFrame.plot`. + + Returns + ------- + class:`matplotlib.AxesSubplot` + Return a histogram plot. + + See Also + -------- + DataFrame.hist : Draw histograms per DataFrame's Series. + Series.hist : Draw a histogram with Series' data. + + Examples + -------- + When we roll a die 6000 times, we expect to get each value around 1000 + times. But when we roll two dice and sum the result, the distribution + is going to be quite different. A histogram illustrates those + distributions. + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame(np.random.randint(1, 7, 6000), columns=['one']) + >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000) + >>> ax = df.plot.hist(bins=12, alpha=0.5) + + A grouped histogram can be generated by providing the parameter `by` (which + can be a column name, or a list of column names): + + .. plot:: + :context: close-figs + + >>> age_list = [8, 10, 12, 14, 72, 74, 76, 78, 20, 25, 30, 35, 60, 85] + >>> df = pd.DataFrame({"gender": list("MMMMMMMMFFFFFF"), "age": age_list}) + >>> ax = df.plot.hist(column=["age"], by="gender", figsize=(10, 8)) + """ + return self(kind="hist", by=by, bins=bins, **kwargs) + + def kde( + self, + bw_method: Literal["scott", "silverman"] | float | Callable | None = None, + ind: np.ndarray | int | None = None, + **kwargs, + ) -> PlotAccessor: + """ + Generate Kernel Density Estimate plot using Gaussian kernels. + + In statistics, `kernel density estimation`_ (KDE) is a non-parametric + way to estimate the probability density function (PDF) of a random + variable. This function uses Gaussian kernels and includes automatic + bandwidth determination. + + .. _kernel density estimation: + https://en.wikipedia.org/wiki/Kernel_density_estimation + + Parameters + ---------- + bw_method : str, scalar or callable, optional + The method used to calculate the estimator bandwidth. This can be + 'scott', 'silverman', a scalar constant or a callable. + If None (default), 'scott' is used. + See :class:`scipy.stats.gaussian_kde` for more information. + ind : NumPy array or int, optional + Evaluation points for the estimated PDF. If None (default), + 1000 equally spaced points are used. If `ind` is a NumPy array, the + KDE is evaluated at the points passed. If `ind` is an integer, + `ind` number of equally spaced points are used. + **kwargs + Additional keyword arguments are documented in + :meth:`DataFrame.plot`. + + Returns + ------- + matplotlib.axes.Axes or numpy.ndarray of them + + See Also + -------- + scipy.stats.gaussian_kde : Representation of a kernel-density + estimate using Gaussian kernels. This is the function used + internally to estimate the PDF. + + Examples + -------- + Given a Series of points randomly sampled from an unknown + distribution, estimate its PDF using KDE with automatic + bandwidth determination and plot the results, evaluating them at + 1000 equally spaced points (default): + + .. plot:: + :context: close-figs + + >>> s = pd.Series([1, 2, 2.5, 3, 3.5, 4, 5]) + >>> ax = s.plot.kde() + + A scalar bandwidth can be specified. Using a small bandwidth value can + lead to over-fitting, while using a large bandwidth value may result + in under-fitting: + + .. plot:: + :context: close-figs + + >>> ax = s.plot.kde(bw_method=0.3) + + .. plot:: + :context: close-figs + + >>> ax = s.plot.kde(bw_method=3) + + Finally, the `ind` parameter determines the evaluation points for the + plot of the estimated PDF: + + .. plot:: + :context: close-figs + + >>> ax = s.plot.kde(ind=[1, 2, 3, 4, 5]) + + For DataFrame, it works in the same way: + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame({ + ... 'x': [1, 2, 2.5, 3, 3.5, 4, 5], + ... 'y': [4, 4, 4.5, 5, 5.5, 6, 6], + ... }) + >>> ax = df.plot.kde() + + A scalar bandwidth can be specified. Using a small bandwidth value can + lead to over-fitting, while using a large bandwidth value may result + in under-fitting: + + .. plot:: + :context: close-figs + + >>> ax = df.plot.kde(bw_method=0.3) + + .. plot:: + :context: close-figs + + >>> ax = df.plot.kde(bw_method=3) + + Finally, the `ind` parameter determines the evaluation points for the + plot of the estimated PDF: + + .. plot:: + :context: close-figs + + >>> ax = df.plot.kde(ind=[1, 2, 3, 4, 5, 6]) + """ + return self(kind="kde", bw_method=bw_method, ind=ind, **kwargs) + + density = kde + + def area( + self, + x: Hashable | None = None, + y: Hashable | None = None, + stacked: bool = True, + **kwargs, + ) -> PlotAccessor: + """ + Draw a stacked area plot. + + An area plot displays quantitative data visually. + This function wraps the matplotlib area function. + + Parameters + ---------- + x : label or position, optional + Coordinates for the X axis. By default uses the index. + y : label or position, optional + Column to plot. By default uses all columns. + stacked : bool, default True + Area plots are stacked by default. Set to False to create a + unstacked plot. + **kwargs + Additional keyword arguments are documented in + :meth:`DataFrame.plot`. + + Returns + ------- + matplotlib.axes.Axes or numpy.ndarray + Area plot, or array of area plots if subplots is True. + + See Also + -------- + DataFrame.plot : Make plots of DataFrame using matplotlib / pylab. + + Examples + -------- + Draw an area plot based on basic business metrics: + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame({ + ... 'sales': [3, 2, 3, 9, 10, 6], + ... 'signups': [5, 5, 6, 12, 14, 13], + ... 'visits': [20, 42, 28, 62, 81, 50], + ... }, index=pd.date_range(start='2018/01/01', end='2018/07/01', + ... freq='ME')) + >>> ax = df.plot.area() + + Area plots are stacked by default. To produce an unstacked plot, + pass ``stacked=False``: + + .. plot:: + :context: close-figs + + >>> ax = df.plot.area(stacked=False) + + Draw an area plot for a single column: + + .. plot:: + :context: close-figs + + >>> ax = df.plot.area(y='sales') + + Draw with a different `x`: + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame({ + ... 'sales': [3, 2, 3], + ... 'visits': [20, 42, 28], + ... 'day': [1, 2, 3], + ... }) + >>> ax = df.plot.area(x='day') + """ + return self(kind="area", x=x, y=y, stacked=stacked, **kwargs) + + def pie(self, **kwargs) -> PlotAccessor: + """ + Generate a pie plot. + + A pie plot is a proportional representation of the numerical data in a + column. This function wraps :meth:`matplotlib.pyplot.pie` for the + specified column. If no column reference is passed and + ``subplots=True`` a pie plot is drawn for each numerical column + independently. + + Parameters + ---------- + y : int or label, optional + Label or position of the column to plot. + If not provided, ``subplots=True`` argument must be passed. + **kwargs + Keyword arguments to pass on to :meth:`DataFrame.plot`. + + Returns + ------- + matplotlib.axes.Axes or np.ndarray of them + A NumPy array is returned when `subplots` is True. + + See Also + -------- + Series.plot.pie : Generate a pie plot for a Series. + DataFrame.plot : Make plots of a DataFrame. + + Examples + -------- + In the example below we have a DataFrame with the information about + planet's mass and radius. We pass the 'mass' column to the + pie function to get a pie plot. + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame({'mass': [0.330, 4.87 , 5.97], + ... 'radius': [2439.7, 6051.8, 6378.1]}, + ... index=['Mercury', 'Venus', 'Earth']) + >>> plot = df.plot.pie(y='mass', figsize=(5, 5)) + + .. plot:: + :context: close-figs + + >>> plot = df.plot.pie(subplots=True, figsize=(11, 6)) + """ + if ( + isinstance(self._parent, ABCDataFrame) + and kwargs.get("y", None) is None + and not kwargs.get("subplots", False) + ): + raise ValueError("pie requires either y column or 'subplots=True'") + return self(kind="pie", **kwargs) + + def scatter( + self, + x: Hashable, + y: Hashable, + s: Hashable | Sequence[Hashable] | None = None, + c: Hashable | Sequence[Hashable] | None = None, + **kwargs, + ) -> PlotAccessor: + """ + Create a scatter plot with varying marker point size and color. + + The coordinates of each point are defined by two dataframe columns and + filled circles are used to represent each point. This kind of plot is + useful to see complex correlations between two variables. Points could + be for instance natural 2D coordinates like longitude and latitude in + a map or, in general, any pair of metrics that can be plotted against + each other. + + Parameters + ---------- + x : int or str + The column name or column position to be used as horizontal + coordinates for each point. + y : int or str + The column name or column position to be used as vertical + coordinates for each point. + s : str, scalar or array-like, optional + The size of each point. Possible values are: + + - A string with the name of the column to be used for marker's size. + + - A single scalar so all points have the same size. + + - A sequence of scalars, which will be used for each point's size + recursively. For instance, when passing [2,14] all points size + will be either 2 or 14, alternatively. + + c : str, int or array-like, optional + The color of each point. Possible values are: + + - A single color string referred to by name, RGB or RGBA code, + for instance 'red' or '#a98d19'. + + - A sequence of color strings referred to by name, RGB or RGBA + code, which will be used for each point's color recursively. For + instance ['green','yellow'] all points will be filled in green or + yellow, alternatively. + + - A column name or position whose values will be used to color the + marker points according to a colormap. + + **kwargs + Keyword arguments to pass on to :meth:`DataFrame.plot`. + + Returns + ------- + :class:`matplotlib.axes.Axes` or numpy.ndarray of them + + See Also + -------- + matplotlib.pyplot.scatter : Scatter plot using multiple input data + formats. + + Examples + -------- + Let's see how to draw a scatter plot using coordinates from the values + in a DataFrame's columns. + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1], + ... [6.4, 3.2, 1], [5.9, 3.0, 2]], + ... columns=['length', 'width', 'species']) + >>> ax1 = df.plot.scatter(x='length', + ... y='width', + ... c='DarkBlue') + + And now with the color determined by a column as well. + + .. plot:: + :context: close-figs + + >>> ax2 = df.plot.scatter(x='length', + ... y='width', + ... c='species', + ... colormap='viridis') + """ + return self(kind="scatter", x=x, y=y, s=s, c=c, **kwargs) + + def hexbin( + self, + x: Hashable, + y: Hashable, + C: Hashable | None = None, + reduce_C_function: Callable | None = None, + gridsize: int | tuple[int, int] | None = None, + **kwargs, + ) -> PlotAccessor: + """ + Generate a hexagonal binning plot. + + Generate a hexagonal binning plot of `x` versus `y`. If `C` is `None` + (the default), this is a histogram of the number of occurrences + of the observations at ``(x[i], y[i])``. + + If `C` is specified, specifies values at given coordinates + ``(x[i], y[i])``. These values are accumulated for each hexagonal + bin and then reduced according to `reduce_C_function`, + having as default the NumPy's mean function (:meth:`numpy.mean`). + (If `C` is specified, it must also be a 1-D sequence + of the same length as `x` and `y`, or a column label.) + + Parameters + ---------- + x : int or str + The column label or position for x points. + y : int or str + The column label or position for y points. + C : int or str, optional + The column label or position for the value of `(x, y)` point. + reduce_C_function : callable, default `np.mean` + Function of one argument that reduces all the values in a bin to + a single number (e.g. `np.mean`, `np.max`, `np.sum`, `np.std`). + gridsize : int or tuple of (int, int), default 100 + The number of hexagons in the x-direction. + The corresponding number of hexagons in the y-direction is + chosen in a way that the hexagons are approximately regular. + Alternatively, gridsize can be a tuple with two elements + specifying the number of hexagons in the x-direction and the + y-direction. + **kwargs + Additional keyword arguments are documented in + :meth:`DataFrame.plot`. + + Returns + ------- + matplotlib.AxesSubplot + The matplotlib ``Axes`` on which the hexbin is plotted. + + See Also + -------- + DataFrame.plot : Make plots of a DataFrame. + matplotlib.pyplot.hexbin : Hexagonal binning plot using matplotlib, + the matplotlib function that is used under the hood. + + Examples + -------- + The following examples are generated with random data from + a normal distribution. + + .. plot:: + :context: close-figs + + >>> n = 10000 + >>> df = pd.DataFrame({'x': np.random.randn(n), + ... 'y': np.random.randn(n)}) + >>> ax = df.plot.hexbin(x='x', y='y', gridsize=20) + + The next example uses `C` and `np.sum` as `reduce_C_function`. + Note that `'observations'` values ranges from 1 to 5 but the result + plot shows values up to more than 25. This is because of the + `reduce_C_function`. + + .. plot:: + :context: close-figs + + >>> n = 500 + >>> df = pd.DataFrame({ + ... 'coord_x': np.random.uniform(-3, 3, size=n), + ... 'coord_y': np.random.uniform(30, 50, size=n), + ... 'observations': np.random.randint(1,5, size=n) + ... }) + >>> ax = df.plot.hexbin(x='coord_x', + ... y='coord_y', + ... C='observations', + ... reduce_C_function=np.sum, + ... gridsize=10, + ... cmap="viridis") + """ + if reduce_C_function is not None: + kwargs["reduce_C_function"] = reduce_C_function + if gridsize is not None: + kwargs["gridsize"] = gridsize + + return self(kind="hexbin", x=x, y=y, C=C, **kwargs) + + +_backends: dict[str, types.ModuleType] = {} + + +def _load_backend(backend: str) -> types.ModuleType: + """ + Load a pandas plotting backend. + + Parameters + ---------- + backend : str + The identifier for the backend. Either an entrypoint item registered + with importlib.metadata, "matplotlib", or a module name. + + Returns + ------- + types.ModuleType + The imported backend. + """ + from importlib.metadata import entry_points + + if backend == "matplotlib": + # Because matplotlib is an optional dependency and first-party backend, + # we need to attempt an import here to raise an ImportError if needed. + try: + module = importlib.import_module("pandas.plotting._matplotlib") + except ImportError: + raise ImportError( + "matplotlib is required for plotting when the " + 'default backend "matplotlib" is selected.' + ) from None + return module + + found_backend = False + + eps = entry_points() + key = "pandas_plotting_backends" + # entry_points lost dict API ~ PY 3.10 + # https://github.com/python/importlib_metadata/issues/298 + if hasattr(eps, "select"): + entry = eps.select(group=key) + else: + # Argument 2 to "get" of "dict" has incompatible type "Tuple[]"; + # expected "EntryPoints" [arg-type] + entry = eps.get(key, ()) # type: ignore[arg-type] + for entry_point in entry: + found_backend = entry_point.name == backend + if found_backend: + module = entry_point.load() + break + + if not found_backend: + # Fall back to unregistered, module name approach. + try: + module = importlib.import_module(backend) + found_backend = True + except ImportError: + # We re-raise later on. + pass + + if found_backend: + if hasattr(module, "plot"): + # Validate that the interface is implemented when the option is set, + # rather than at plot time. + return module + + raise ValueError( + f"Could not find plotting backend '{backend}'. Ensure that you've " + f"installed the package providing the '{backend}' entrypoint, or that " + "the package has a top-level `.plot` method." + ) + + +def _get_plot_backend(backend: str | None = None): + """ + Return the plotting backend to use (e.g. `pandas.plotting._matplotlib`). + + The plotting system of pandas uses matplotlib by default, but the idea here + is that it can also work with other third-party backends. This function + returns the module which provides a top-level `.plot` method that will + actually do the plotting. The backend is specified from a string, which + either comes from the keyword argument `backend`, or, if not specified, from + the option `pandas.options.plotting.backend`. All the rest of the code in + this file uses the backend specified there for the plotting. + + The backend is imported lazily, as matplotlib is a soft dependency, and + pandas can be used without it being installed. + + Notes + ----- + Modifies `_backends` with imported backend as a side effect. + """ + backend_str: str = backend or get_option("plotting.backend") + + if backend_str in _backends: + return _backends[backend_str] + + module = _load_backend(backend_str) + _backends[backend_str] = module + return module diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_misc.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_misc.py new file mode 100644 index 0000000000000000000000000000000000000000..18db460d388a4b748f91282ae42875206ba36cc6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_misc.py @@ -0,0 +1,688 @@ +from __future__ import annotations + +from contextlib import contextmanager +from typing import ( + TYPE_CHECKING, + Any, +) + +from pandas.plotting._core import _get_plot_backend + +if TYPE_CHECKING: + from collections.abc import ( + Generator, + Mapping, + ) + + from matplotlib.axes import Axes + from matplotlib.colors import Colormap + from matplotlib.figure import Figure + from matplotlib.table import Table + import numpy as np + + from pandas import ( + DataFrame, + Series, + ) + + +def table(ax: Axes, data: DataFrame | Series, **kwargs) -> Table: + """ + Helper function to convert DataFrame and Series to matplotlib.table. + + Parameters + ---------- + ax : Matplotlib axes object + data : DataFrame or Series + Data for table contents. + **kwargs + Keyword arguments to be passed to matplotlib.table.table. + If `rowLabels` or `colLabels` is not specified, data index or column + name will be used. + + Returns + ------- + matplotlib table object + + Examples + -------- + + .. plot:: + :context: close-figs + + >>> import matplotlib.pyplot as plt + >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + >>> fix, ax = plt.subplots() + >>> ax.axis('off') + (0.0, 1.0, 0.0, 1.0) + >>> table = pd.plotting.table(ax, df, loc='center', + ... cellLoc='center', colWidths=list([.2, .2])) + """ + plot_backend = _get_plot_backend("matplotlib") + return plot_backend.table( + ax=ax, data=data, rowLabels=None, colLabels=None, **kwargs + ) + + +def register() -> None: + """ + Register pandas formatters and converters with matplotlib. + + This function modifies the global ``matplotlib.units.registry`` + dictionary. pandas adds custom converters for + + * pd.Timestamp + * pd.Period + * np.datetime64 + * datetime.datetime + * datetime.date + * datetime.time + + See Also + -------- + deregister_matplotlib_converters : Remove pandas formatters and converters. + + Examples + -------- + .. plot:: + :context: close-figs + + The following line is done automatically by pandas so + the plot can be rendered: + + >>> pd.plotting.register_matplotlib_converters() + + >>> df = pd.DataFrame({'ts': pd.period_range('2020', periods=2, freq='M'), + ... 'y': [1, 2] + ... }) + >>> plot = df.plot.line(x='ts', y='y') + + Unsetting the register manually an error will be raised: + + >>> pd.set_option("plotting.matplotlib.register_converters", + ... False) # doctest: +SKIP + >>> df.plot.line(x='ts', y='y') # doctest: +SKIP + Traceback (most recent call last): + TypeError: float() argument must be a string or a real number, not 'Period' + """ + plot_backend = _get_plot_backend("matplotlib") + plot_backend.register() + + +def deregister() -> None: + """ + Remove pandas formatters and converters. + + Removes the custom converters added by :func:`register`. This + attempts to set the state of the registry back to the state before + pandas registered its own units. Converters for pandas' own types like + Timestamp and Period are removed completely. Converters for types + pandas overwrites, like ``datetime.datetime``, are restored to their + original value. + + See Also + -------- + register_matplotlib_converters : Register pandas formatters and converters + with matplotlib. + + Examples + -------- + .. plot:: + :context: close-figs + + The following line is done automatically by pandas so + the plot can be rendered: + + >>> pd.plotting.register_matplotlib_converters() + + >>> df = pd.DataFrame({'ts': pd.period_range('2020', periods=2, freq='M'), + ... 'y': [1, 2] + ... }) + >>> plot = df.plot.line(x='ts', y='y') + + Unsetting the register manually an error will be raised: + + >>> pd.set_option("plotting.matplotlib.register_converters", + ... False) # doctest: +SKIP + >>> df.plot.line(x='ts', y='y') # doctest: +SKIP + Traceback (most recent call last): + TypeError: float() argument must be a string or a real number, not 'Period' + """ + plot_backend = _get_plot_backend("matplotlib") + plot_backend.deregister() + + +def scatter_matrix( + frame: DataFrame, + alpha: float = 0.5, + figsize: tuple[float, float] | None = None, + ax: Axes | None = None, + grid: bool = False, + diagonal: str = "hist", + marker: str = ".", + density_kwds: Mapping[str, Any] | None = None, + hist_kwds: Mapping[str, Any] | None = None, + range_padding: float = 0.05, + **kwargs, +) -> np.ndarray: + """ + Draw a matrix of scatter plots. + + Parameters + ---------- + frame : DataFrame + alpha : float, optional + Amount of transparency applied. + figsize : (float,float), optional + A tuple (width, height) in inches. + ax : Matplotlib axis object, optional + grid : bool, optional + Setting this to True will show the grid. + diagonal : {'hist', 'kde'} + Pick between 'kde' and 'hist' for either Kernel Density Estimation or + Histogram plot in the diagonal. + marker : str, optional + Matplotlib marker type, default '.'. + density_kwds : keywords + Keyword arguments to be passed to kernel density estimate plot. + hist_kwds : keywords + Keyword arguments to be passed to hist function. + range_padding : float, default 0.05 + Relative extension of axis range in x and y with respect to + (x_max - x_min) or (y_max - y_min). + **kwargs + Keyword arguments to be passed to scatter function. + + Returns + ------- + numpy.ndarray + A matrix of scatter plots. + + Examples + -------- + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D']) + >>> pd.plotting.scatter_matrix(df, alpha=0.2) + array([[, , + , ], + [, , + , ], + [, , + , ], + [, , + , ]], + dtype=object) + """ + plot_backend = _get_plot_backend("matplotlib") + return plot_backend.scatter_matrix( + frame=frame, + alpha=alpha, + figsize=figsize, + ax=ax, + grid=grid, + diagonal=diagonal, + marker=marker, + density_kwds=density_kwds, + hist_kwds=hist_kwds, + range_padding=range_padding, + **kwargs, + ) + + +def radviz( + frame: DataFrame, + class_column: str, + ax: Axes | None = None, + color: list[str] | tuple[str, ...] | None = None, + colormap: Colormap | str | None = None, + **kwds, +) -> Axes: + """ + Plot a multidimensional dataset in 2D. + + Each Series in the DataFrame is represented as a evenly distributed + slice on a circle. Each data point is rendered in the circle according to + the value on each Series. Highly correlated `Series` in the `DataFrame` + are placed closer on the unit circle. + + RadViz allow to project a N-dimensional data set into a 2D space where the + influence of each dimension can be interpreted as a balance between the + influence of all dimensions. + + More info available at the `original article + `_ + describing RadViz. + + Parameters + ---------- + frame : `DataFrame` + Object holding the data. + class_column : str + Column name containing the name of the data point category. + ax : :class:`matplotlib.axes.Axes`, optional + A plot instance to which to add the information. + color : list[str] or tuple[str], optional + Assign a color to each category. Example: ['blue', 'green']. + colormap : str or :class:`matplotlib.colors.Colormap`, default None + Colormap to select colors from. If string, load colormap with that + name from matplotlib. + **kwds + Options to pass to matplotlib scatter plotting method. + + Returns + ------- + :class:`matplotlib.axes.Axes` + + See Also + -------- + pandas.plotting.andrews_curves : Plot clustering visualization. + + Examples + -------- + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame( + ... { + ... 'SepalLength': [6.5, 7.7, 5.1, 5.8, 7.6, 5.0, 5.4, 4.6, 6.7, 4.6], + ... 'SepalWidth': [3.0, 3.8, 3.8, 2.7, 3.0, 2.3, 3.0, 3.2, 3.3, 3.6], + ... 'PetalLength': [5.5, 6.7, 1.9, 5.1, 6.6, 3.3, 4.5, 1.4, 5.7, 1.0], + ... 'PetalWidth': [1.8, 2.2, 0.4, 1.9, 2.1, 1.0, 1.5, 0.2, 2.1, 0.2], + ... 'Category': [ + ... 'virginica', + ... 'virginica', + ... 'setosa', + ... 'virginica', + ... 'virginica', + ... 'versicolor', + ... 'versicolor', + ... 'setosa', + ... 'virginica', + ... 'setosa' + ... ] + ... } + ... ) + >>> pd.plotting.radviz(df, 'Category') # doctest: +SKIP + """ + plot_backend = _get_plot_backend("matplotlib") + return plot_backend.radviz( + frame=frame, + class_column=class_column, + ax=ax, + color=color, + colormap=colormap, + **kwds, + ) + + +def andrews_curves( + frame: DataFrame, + class_column: str, + ax: Axes | None = None, + samples: int = 200, + color: list[str] | tuple[str, ...] | None = None, + colormap: Colormap | str | None = None, + **kwargs, +) -> Axes: + """ + Generate a matplotlib plot for visualizing clusters of multivariate data. + + Andrews curves have the functional form: + + .. math:: + f(t) = \\frac{x_1}{\\sqrt{2}} + x_2 \\sin(t) + x_3 \\cos(t) + + x_4 \\sin(2t) + x_5 \\cos(2t) + \\cdots + + Where :math:`x` coefficients correspond to the values of each dimension + and :math:`t` is linearly spaced between :math:`-\\pi` and :math:`+\\pi`. + Each row of frame then corresponds to a single curve. + + Parameters + ---------- + frame : DataFrame + Data to be plotted, preferably normalized to (0.0, 1.0). + class_column : label + Name of the column containing class names. + ax : axes object, default None + Axes to use. + samples : int + Number of points to plot in each curve. + color : str, list[str] or tuple[str], optional + Colors to use for the different classes. Colors can be strings + or 3-element floating point RGB values. + colormap : str or matplotlib colormap object, default None + Colormap to select colors from. If a string, load colormap with that + name from matplotlib. + **kwargs + Options to pass to matplotlib plotting method. + + Returns + ------- + :class:`matplotlib.axes.Axes` + + Examples + -------- + + .. plot:: + :context: close-figs + + >>> df = pd.read_csv( + ... 'https://raw.githubusercontent.com/pandas-dev/' + ... 'pandas/main/pandas/tests/io/data/csv/iris.csv' + ... ) + >>> pd.plotting.andrews_curves(df, 'Name') # doctest: +SKIP + """ + plot_backend = _get_plot_backend("matplotlib") + return plot_backend.andrews_curves( + frame=frame, + class_column=class_column, + ax=ax, + samples=samples, + color=color, + colormap=colormap, + **kwargs, + ) + + +def bootstrap_plot( + series: Series, + fig: Figure | None = None, + size: int = 50, + samples: int = 500, + **kwds, +) -> Figure: + """ + Bootstrap plot on mean, median and mid-range statistics. + + The bootstrap plot is used to estimate the uncertainty of a statistic + by relying on random sampling with replacement [1]_. This function will + generate bootstrapping plots for mean, median and mid-range statistics + for the given number of samples of the given size. + + .. [1] "Bootstrapping (statistics)" in \ + https://en.wikipedia.org/wiki/Bootstrapping_%28statistics%29 + + Parameters + ---------- + series : pandas.Series + Series from where to get the samplings for the bootstrapping. + fig : matplotlib.figure.Figure, default None + If given, it will use the `fig` reference for plotting instead of + creating a new one with default parameters. + size : int, default 50 + Number of data points to consider during each sampling. It must be + less than or equal to the length of the `series`. + samples : int, default 500 + Number of times the bootstrap procedure is performed. + **kwds + Options to pass to matplotlib plotting method. + + Returns + ------- + matplotlib.figure.Figure + Matplotlib figure. + + See Also + -------- + pandas.DataFrame.plot : Basic plotting for DataFrame objects. + pandas.Series.plot : Basic plotting for Series objects. + + Examples + -------- + This example draws a basic bootstrap plot for a Series. + + .. plot:: + :context: close-figs + + >>> s = pd.Series(np.random.uniform(size=100)) + >>> pd.plotting.bootstrap_plot(s) # doctest: +SKIP +
+ """ + plot_backend = _get_plot_backend("matplotlib") + return plot_backend.bootstrap_plot( + series=series, fig=fig, size=size, samples=samples, **kwds + ) + + +def parallel_coordinates( + frame: DataFrame, + class_column: str, + cols: list[str] | None = None, + ax: Axes | None = None, + color: list[str] | tuple[str, ...] | None = None, + use_columns: bool = False, + xticks: list | tuple | None = None, + colormap: Colormap | str | None = None, + axvlines: bool = True, + axvlines_kwds: Mapping[str, Any] | None = None, + sort_labels: bool = False, + **kwargs, +) -> Axes: + """ + Parallel coordinates plotting. + + Parameters + ---------- + frame : DataFrame + class_column : str + Column name containing class names. + cols : list, optional + A list of column names to use. + ax : matplotlib.axis, optional + Matplotlib axis object. + color : list or tuple, optional + Colors to use for the different classes. + use_columns : bool, optional + If true, columns will be used as xticks. + xticks : list or tuple, optional + A list of values to use for xticks. + colormap : str or matplotlib colormap, default None + Colormap to use for line colors. + axvlines : bool, optional + If true, vertical lines will be added at each xtick. + axvlines_kwds : keywords, optional + Options to be passed to axvline method for vertical lines. + sort_labels : bool, default False + Sort class_column labels, useful when assigning colors. + **kwargs + Options to pass to matplotlib plotting method. + + Returns + ------- + matplotlib.axes.Axes + + Examples + -------- + + .. plot:: + :context: close-figs + + >>> df = pd.read_csv( + ... 'https://raw.githubusercontent.com/pandas-dev/' + ... 'pandas/main/pandas/tests/io/data/csv/iris.csv' + ... ) + >>> pd.plotting.parallel_coordinates( + ... df, 'Name', color=('#556270', '#4ECDC4', '#C7F464') + ... ) # doctest: +SKIP + """ + plot_backend = _get_plot_backend("matplotlib") + return plot_backend.parallel_coordinates( + frame=frame, + class_column=class_column, + cols=cols, + ax=ax, + color=color, + use_columns=use_columns, + xticks=xticks, + colormap=colormap, + axvlines=axvlines, + axvlines_kwds=axvlines_kwds, + sort_labels=sort_labels, + **kwargs, + ) + + +def lag_plot(series: Series, lag: int = 1, ax: Axes | None = None, **kwds) -> Axes: + """ + Lag plot for time series. + + Parameters + ---------- + series : Series + The time series to visualize. + lag : int, default 1 + Lag length of the scatter plot. + ax : Matplotlib axis object, optional + The matplotlib axis object to use. + **kwds + Matplotlib scatter method keyword arguments. + + Returns + ------- + matplotlib.axes.Axes + + Examples + -------- + Lag plots are most commonly used to look for patterns in time series data. + + Given the following time series + + .. plot:: + :context: close-figs + + >>> np.random.seed(5) + >>> x = np.cumsum(np.random.normal(loc=1, scale=5, size=50)) + >>> s = pd.Series(x) + >>> s.plot() # doctest: +SKIP + + A lag plot with ``lag=1`` returns + + .. plot:: + :context: close-figs + + >>> pd.plotting.lag_plot(s, lag=1) + + """ + plot_backend = _get_plot_backend("matplotlib") + return plot_backend.lag_plot(series=series, lag=lag, ax=ax, **kwds) + + +def autocorrelation_plot(series: Series, ax: Axes | None = None, **kwargs) -> Axes: + """ + Autocorrelation plot for time series. + + Parameters + ---------- + series : Series + The time series to visualize. + ax : Matplotlib axis object, optional + The matplotlib axis object to use. + **kwargs + Options to pass to matplotlib plotting method. + + Returns + ------- + matplotlib.axes.Axes + + Examples + -------- + The horizontal lines in the plot correspond to 95% and 99% confidence bands. + + The dashed line is 99% confidence band. + + .. plot:: + :context: close-figs + + >>> spacing = np.linspace(-9 * np.pi, 9 * np.pi, num=1000) + >>> s = pd.Series(0.7 * np.random.rand(1000) + 0.3 * np.sin(spacing)) + >>> pd.plotting.autocorrelation_plot(s) # doctest: +SKIP + """ + plot_backend = _get_plot_backend("matplotlib") + return plot_backend.autocorrelation_plot(series=series, ax=ax, **kwargs) + + +class _Options(dict): + """ + Stores pandas plotting options. + + Allows for parameter aliasing so you can just use parameter names that are + the same as the plot function parameters, but is stored in a canonical + format that makes it easy to breakdown into groups later. + + Examples + -------- + + .. plot:: + :context: close-figs + + >>> np.random.seed(42) + >>> df = pd.DataFrame({'A': np.random.randn(10), + ... 'B': np.random.randn(10)}, + ... index=pd.date_range("1/1/2000", + ... freq='4MS', periods=10)) + >>> with pd.plotting.plot_params.use("x_compat", True): + ... _ = df["A"].plot(color="r") + ... _ = df["B"].plot(color="g") + """ + + # alias so the names are same as plotting method parameter names + _ALIASES = {"x_compat": "xaxis.compat"} + _DEFAULT_KEYS = ["xaxis.compat"] + + def __init__(self, deprecated: bool = False) -> None: + self._deprecated = deprecated + super().__setitem__("xaxis.compat", False) + + def __getitem__(self, key): + key = self._get_canonical_key(key) + if key not in self: + raise ValueError(f"{key} is not a valid pandas plotting option") + return super().__getitem__(key) + + def __setitem__(self, key, value) -> None: + key = self._get_canonical_key(key) + super().__setitem__(key, value) + + def __delitem__(self, key) -> None: + key = self._get_canonical_key(key) + if key in self._DEFAULT_KEYS: + raise ValueError(f"Cannot remove default parameter {key}") + super().__delitem__(key) + + def __contains__(self, key) -> bool: + key = self._get_canonical_key(key) + return super().__contains__(key) + + def reset(self) -> None: + """ + Reset the option store to its initial state + + Returns + ------- + None + """ + # error: Cannot access "__init__" directly + self.__init__() # type: ignore[misc] + + def _get_canonical_key(self, key): + return self._ALIASES.get(key, key) + + @contextmanager + def use(self, key, value) -> Generator[_Options, None, None]: + """ + Temporarily set a parameter value using the with statement. + Aliasing allowed. + """ + old_value = self[key] + try: + self[key] = value + yield self + finally: + self[key] = old_value + + +plot_params = _Options() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/api/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/api/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b35a940bbcc86266e33d08d48e7e282a2d2eabd7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/api/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/api/__pycache__/test_api.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/api/__pycache__/test_api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f533f20f4dc9a81ede985aa6010b6770f37ea4e1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/api/__pycache__/test_api.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/api/__pycache__/test_types.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/api/__pycache__/test_types.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..475e1c5d91ea8fba70d5b4d80637c1285ecb5a54 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/api/__pycache__/test_types.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63df65a35c0050aeb3d01759115ac171d2e9c3ad Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/common.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/common.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c87df87cddf3f27a375c832655b301cc474a468 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/common.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_frame_apply.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_frame_apply.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..884f6223426c5998a1b3ad01e69b271a3c362935 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_frame_apply.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_frame_apply_relabeling.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_frame_apply_relabeling.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6634c4e5c10e7bac9c761c3262d7ccf3d8905376 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_frame_apply_relabeling.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_frame_transform.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_frame_transform.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3482cf156c16b504aab6381c2a67c19a8b2a1f13 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_frame_transform.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_invalid_arg.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_invalid_arg.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d61b4de24aca4e39ff1d7a9f4b4918561c5efb4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_invalid_arg.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_numba.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_numba.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e3afc884b4887e54ed296ce76e50be1abf28752 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_numba.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_series_apply.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_series_apply.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..064615cdc729bf92da62afd7867ebc7f1157af1f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_series_apply.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_series_apply_relabeling.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_series_apply_relabeling.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..243c213594f39bb4e70caaeed2cb9119f3d4c3af Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_series_apply_relabeling.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_series_transform.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_series_transform.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24bc64b3fd055c4cda828d6b04a615da47ae07d1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_series_transform.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_str.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_str.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b812132dea205fe570fb373c3913030aece5cf88 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/__pycache__/test_str.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/common.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/common.py new file mode 100644 index 0000000000000000000000000000000000000000..b4d153df54059ca2a82f336e19afb4297eb218a2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/common.py @@ -0,0 +1,7 @@ +from pandas.core.groupby.base import transformation_kernels + +# There is no Series.cumcount or DataFrame.cumcount +series_transform_kernels = [ + x for x in sorted(transformation_kernels) if x != "cumcount" +] +frame_transform_kernels = [x for x in sorted(transformation_kernels) if x != "cumcount"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_frame_apply.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_frame_apply.py new file mode 100644 index 0000000000000000000000000000000000000000..b7eac6b8f0ea11cfdaaf760101eb407901c90319 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_frame_apply.py @@ -0,0 +1,1733 @@ +from datetime import datetime +import warnings + +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas as pd +from pandas import ( + DataFrame, + MultiIndex, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm +from pandas.tests.frame.common import zip_frames + + +@pytest.fixture +def int_frame_const_col(): + """ + Fixture for DataFrame of ints which are constant per column + + Columns are ['A', 'B', 'C'], with values (per column): [1, 2, 3] + """ + df = DataFrame( + np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1, + columns=["A", "B", "C"], + ) + return df + + +@pytest.fixture(params=["python", pytest.param("numba", marks=pytest.mark.single_cpu)]) +def engine(request): + if request.param == "numba": + pytest.importorskip("numba") + return request.param + + +def test_apply(float_frame, engine, request): + if engine == "numba": + mark = pytest.mark.xfail(reason="numba engine not supporting numpy ufunc yet") + request.node.add_marker(mark) + with np.errstate(all="ignore"): + # ufunc + result = np.sqrt(float_frame["A"]) + expected = float_frame.apply(np.sqrt, engine=engine)["A"] + tm.assert_series_equal(result, expected) + + # aggregator + result = float_frame.apply(np.mean, engine=engine)["A"] + expected = np.mean(float_frame["A"]) + assert result == expected + + d = float_frame.index[0] + result = float_frame.apply(np.mean, axis=1, engine=engine) + expected = np.mean(float_frame.xs(d)) + assert result[d] == expected + assert result.index is float_frame.index + + +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize("raw", [True, False]) +def test_apply_args(float_frame, axis, raw, engine, request): + if engine == "numba": + mark = pytest.mark.xfail(reason="numba engine doesn't support args") + request.node.add_marker(mark) + result = float_frame.apply( + lambda x, y: x + y, axis, args=(1,), raw=raw, engine=engine + ) + expected = float_frame + 1 + tm.assert_frame_equal(result, expected) + + +def test_apply_categorical_func(): + # GH 9573 + df = DataFrame({"c0": ["A", "A", "B", "B"], "c1": ["C", "C", "D", "D"]}) + result = df.apply(lambda ts: ts.astype("category")) + + assert result.shape == (4, 2) + assert isinstance(result["c0"].dtype, CategoricalDtype) + assert isinstance(result["c1"].dtype, CategoricalDtype) + + +def test_apply_axis1_with_ea(): + # GH#36785 + expected = DataFrame({"A": [Timestamp("2013-01-01", tz="UTC")]}) + result = expected.apply(lambda x: x, axis=1) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data, dtype", + [(1, None), (1, CategoricalDtype([1])), (Timestamp("2013-01-01", tz="UTC"), None)], +) +def test_agg_axis1_duplicate_index(data, dtype): + # GH 42380 + expected = DataFrame([[data], [data]], index=["a", "a"], dtype=dtype) + result = expected.agg(lambda x: x, axis=1) + tm.assert_frame_equal(result, expected) + + +def test_apply_mixed_datetimelike(): + # mixed datetimelike + # GH 7778 + expected = DataFrame( + { + "A": date_range("20130101", periods=3), + "B": pd.to_timedelta(np.arange(3), unit="s"), + } + ) + result = expected.apply(lambda x: x, axis=1) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("func", [np.sqrt, np.mean]) +def test_apply_empty(func, engine): + # empty + empty_frame = DataFrame() + + result = empty_frame.apply(func, engine=engine) + assert result.empty + + +def test_apply_float_frame(float_frame, engine): + no_rows = float_frame[:0] + result = no_rows.apply(lambda x: x.mean(), engine=engine) + expected = Series(np.nan, index=float_frame.columns) + tm.assert_series_equal(result, expected) + + no_cols = float_frame.loc[:, []] + result = no_cols.apply(lambda x: x.mean(), axis=1, engine=engine) + expected = Series(np.nan, index=float_frame.index) + tm.assert_series_equal(result, expected) + + +def test_apply_empty_except_index(engine): + # GH 2476 + expected = DataFrame(index=["a"]) + result = expected.apply(lambda x: x["a"], axis=1, engine=engine) + tm.assert_frame_equal(result, expected) + + +def test_apply_with_reduce_empty(): + # reduce with an empty DataFrame + empty_frame = DataFrame() + + x = [] + result = empty_frame.apply(x.append, axis=1, result_type="expand") + tm.assert_frame_equal(result, empty_frame) + result = empty_frame.apply(x.append, axis=1, result_type="reduce") + expected = Series([], dtype=np.float64) + tm.assert_series_equal(result, expected) + + empty_with_cols = DataFrame(columns=["a", "b", "c"]) + result = empty_with_cols.apply(x.append, axis=1, result_type="expand") + tm.assert_frame_equal(result, empty_with_cols) + result = empty_with_cols.apply(x.append, axis=1, result_type="reduce") + expected = Series([], dtype=np.float64) + tm.assert_series_equal(result, expected) + + # Ensure that x.append hasn't been called + assert x == [] + + +@pytest.mark.parametrize("func", ["sum", "prod", "any", "all"]) +def test_apply_funcs_over_empty(func): + # GH 28213 + df = DataFrame(columns=["a", "b", "c"]) + + result = df.apply(getattr(np, func)) + expected = getattr(df, func)() + if func in ("sum", "prod"): + expected = expected.astype(float) + tm.assert_series_equal(result, expected) + + +def test_nunique_empty(): + # GH 28213 + df = DataFrame(columns=["a", "b", "c"]) + + result = df.nunique() + expected = Series(0, index=df.columns) + tm.assert_series_equal(result, expected) + + result = df.T.nunique() + expected = Series([], dtype=np.float64) + tm.assert_series_equal(result, expected) + + +def test_apply_standard_nonunique(): + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"]) + + result = df.apply(lambda s: s[0], axis=1) + expected = Series([1, 4, 7], ["a", "a", "c"]) + tm.assert_series_equal(result, expected) + + result = df.T.apply(lambda s: s[0], axis=0) + tm.assert_series_equal(result, expected) + + +def test_apply_broadcast_scalars(float_frame): + # scalars + result = float_frame.apply(np.mean, result_type="broadcast") + expected = DataFrame([float_frame.mean()], index=float_frame.index) + tm.assert_frame_equal(result, expected) + + +def test_apply_broadcast_scalars_axis1(float_frame): + result = float_frame.apply(np.mean, axis=1, result_type="broadcast") + m = float_frame.mean(axis=1) + expected = DataFrame({c: m for c in float_frame.columns}) + tm.assert_frame_equal(result, expected) + + +def test_apply_broadcast_lists_columns(float_frame): + # lists + result = float_frame.apply( + lambda x: list(range(len(float_frame.columns))), + axis=1, + result_type="broadcast", + ) + m = list(range(len(float_frame.columns))) + expected = DataFrame( + [m] * len(float_frame.index), + dtype="float64", + index=float_frame.index, + columns=float_frame.columns, + ) + tm.assert_frame_equal(result, expected) + + +def test_apply_broadcast_lists_index(float_frame): + result = float_frame.apply( + lambda x: list(range(len(float_frame.index))), result_type="broadcast" + ) + m = list(range(len(float_frame.index))) + expected = DataFrame( + {c: m for c in float_frame.columns}, + dtype="float64", + index=float_frame.index, + ) + tm.assert_frame_equal(result, expected) + + +def test_apply_broadcast_list_lambda_func(int_frame_const_col): + # preserve columns + df = int_frame_const_col + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="broadcast") + tm.assert_frame_equal(result, df) + + +def test_apply_broadcast_series_lambda_func(int_frame_const_col): + df = int_frame_const_col + result = df.apply( + lambda x: Series([1, 2, 3], index=list("abc")), + axis=1, + result_type="broadcast", + ) + expected = df.copy() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("axis", [0, 1]) +def test_apply_raw_float_frame(float_frame, axis, engine): + if engine == "numba": + pytest.skip("numba can't handle when UDF returns None.") + + def _assert_raw(x): + assert isinstance(x, np.ndarray) + assert x.ndim == 1 + + float_frame.apply(_assert_raw, axis=axis, engine=engine, raw=True) + + +@pytest.mark.parametrize("axis", [0, 1]) +def test_apply_raw_float_frame_lambda(float_frame, axis, engine): + result = float_frame.apply(np.mean, axis=axis, engine=engine, raw=True) + expected = float_frame.apply(lambda x: x.values.mean(), axis=axis) + tm.assert_series_equal(result, expected) + + +def test_apply_raw_float_frame_no_reduction(float_frame, engine): + # no reduction + result = float_frame.apply(lambda x: x * 2, engine=engine, raw=True) + expected = float_frame * 2 + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("axis", [0, 1]) +def test_apply_raw_mixed_type_frame(axis, engine): + if engine == "numba": + pytest.skip("isinstance check doesn't work with numba") + + def _assert_raw(x): + assert isinstance(x, np.ndarray) + assert x.ndim == 1 + + # Mixed dtype (GH-32423) + df = DataFrame( + { + "a": 1.0, + "b": 2, + "c": "foo", + "float32": np.array([1.0] * 10, dtype="float32"), + "int32": np.array([1] * 10, dtype="int32"), + }, + index=np.arange(10), + ) + df.apply(_assert_raw, axis=axis, engine=engine, raw=True) + + +def test_apply_axis1(float_frame): + d = float_frame.index[0] + result = float_frame.apply(np.mean, axis=1)[d] + expected = np.mean(float_frame.xs(d)) + assert result == expected + + +def test_apply_mixed_dtype_corner(): + df = DataFrame({"A": ["foo"], "B": [1.0]}) + result = df[:0].apply(np.mean, axis=1) + # the result here is actually kind of ambiguous, should it be a Series + # or a DataFrame? + expected = Series(np.nan, index=pd.Index([], dtype="int64")) + tm.assert_series_equal(result, expected) + + +def test_apply_mixed_dtype_corner_indexing(): + df = DataFrame({"A": ["foo"], "B": [1.0]}) + result = df.apply(lambda x: x["A"], axis=1) + expected = Series(["foo"], index=[0]) + tm.assert_series_equal(result, expected) + + result = df.apply(lambda x: x["B"], axis=1) + expected = Series([1.0], index=[0]) + tm.assert_series_equal(result, expected) + + +@pytest.mark.filterwarnings("ignore::RuntimeWarning") +@pytest.mark.parametrize("ax", ["index", "columns"]) +@pytest.mark.parametrize( + "func", [lambda x: x, lambda x: x.mean()], ids=["identity", "mean"] +) +@pytest.mark.parametrize("raw", [True, False]) +@pytest.mark.parametrize("axis", [0, 1]) +def test_apply_empty_infer_type(ax, func, raw, axis, engine, request): + df = DataFrame(**{ax: ["a", "b", "c"]}) + + with np.errstate(all="ignore"): + test_res = func(np.array([], dtype="f8")) + is_reduction = not isinstance(test_res, np.ndarray) + + result = df.apply(func, axis=axis, engine=engine, raw=raw) + if is_reduction: + agg_axis = df._get_agg_axis(axis) + assert isinstance(result, Series) + assert result.index is agg_axis + else: + assert isinstance(result, DataFrame) + + +def test_apply_empty_infer_type_broadcast(): + no_cols = DataFrame(index=["a", "b", "c"]) + result = no_cols.apply(lambda x: x.mean(), result_type="broadcast") + assert isinstance(result, DataFrame) + + +def test_apply_with_args_kwds_add_some(float_frame): + def add_some(x, howmuch=0): + return x + howmuch + + result = float_frame.apply(add_some, howmuch=2) + expected = float_frame.apply(lambda x: x + 2) + tm.assert_frame_equal(result, expected) + + +def test_apply_with_args_kwds_agg_and_add(float_frame): + def agg_and_add(x, howmuch=0): + return x.mean() + howmuch + + result = float_frame.apply(agg_and_add, howmuch=2) + expected = float_frame.apply(lambda x: x.mean() + 2) + tm.assert_series_equal(result, expected) + + +def test_apply_with_args_kwds_subtract_and_divide(float_frame): + def subtract_and_divide(x, sub, divide=1): + return (x - sub) / divide + + result = float_frame.apply(subtract_and_divide, args=(2,), divide=2) + expected = float_frame.apply(lambda x: (x - 2.0) / 2.0) + tm.assert_frame_equal(result, expected) + + +def test_apply_yield_list(float_frame): + result = float_frame.apply(list) + tm.assert_frame_equal(result, float_frame) + + +def test_apply_reduce_Series(float_frame): + float_frame.iloc[::2, float_frame.columns.get_loc("A")] = np.nan + expected = float_frame.mean(1) + result = float_frame.apply(np.mean, axis=1) + tm.assert_series_equal(result, expected) + + +def test_apply_reduce_to_dict(): + # GH 25196 37544 + data = DataFrame([[1, 2], [3, 4]], columns=["c0", "c1"], index=["i0", "i1"]) + + result = data.apply(dict, axis=0) + expected = Series([{"i0": 1, "i1": 3}, {"i0": 2, "i1": 4}], index=data.columns) + tm.assert_series_equal(result, expected) + + result = data.apply(dict, axis=1) + expected = Series([{"c0": 1, "c1": 2}, {"c0": 3, "c1": 4}], index=data.index) + tm.assert_series_equal(result, expected) + + +def test_apply_differently_indexed(): + df = DataFrame(np.random.default_rng(2).standard_normal((20, 10))) + + result = df.apply(Series.describe, axis=0) + expected = DataFrame({i: v.describe() for i, v in df.items()}, columns=df.columns) + tm.assert_frame_equal(result, expected) + + result = df.apply(Series.describe, axis=1) + expected = DataFrame({i: v.describe() for i, v in df.T.items()}, columns=df.index).T + tm.assert_frame_equal(result, expected) + + +def test_apply_bug(): + # GH 6125 + positions = DataFrame( + [ + [1, "ABC0", 50], + [1, "YUM0", 20], + [1, "DEF0", 20], + [2, "ABC1", 50], + [2, "YUM1", 20], + [2, "DEF1", 20], + ], + columns=["a", "market", "position"], + ) + + def f(r): + return r["market"] + + expected = positions.apply(f, axis=1) + + positions = DataFrame( + [ + [datetime(2013, 1, 1), "ABC0", 50], + [datetime(2013, 1, 2), "YUM0", 20], + [datetime(2013, 1, 3), "DEF0", 20], + [datetime(2013, 1, 4), "ABC1", 50], + [datetime(2013, 1, 5), "YUM1", 20], + [datetime(2013, 1, 6), "DEF1", 20], + ], + columns=["a", "market", "position"], + ) + result = positions.apply(f, axis=1) + tm.assert_series_equal(result, expected) + + +def test_apply_convert_objects(): + expected = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.default_rng(2).standard_normal(11), + "E": np.random.default_rng(2).standard_normal(11), + "F": np.random.default_rng(2).standard_normal(11), + } + ) + + result = expected.apply(lambda x: x, axis=1) + tm.assert_frame_equal(result, expected) + + +def test_apply_attach_name(float_frame): + result = float_frame.apply(lambda x: x.name) + expected = Series(float_frame.columns, index=float_frame.columns) + tm.assert_series_equal(result, expected) + + +def test_apply_attach_name_axis1(float_frame): + result = float_frame.apply(lambda x: x.name, axis=1) + expected = Series(float_frame.index, index=float_frame.index) + tm.assert_series_equal(result, expected) + + +def test_apply_attach_name_non_reduction(float_frame): + # non-reductions + result = float_frame.apply(lambda x: np.repeat(x.name, len(x))) + expected = DataFrame( + np.tile(float_frame.columns, (len(float_frame.index), 1)), + index=float_frame.index, + columns=float_frame.columns, + ) + tm.assert_frame_equal(result, expected) + + +def test_apply_attach_name_non_reduction_axis1(float_frame): + result = float_frame.apply(lambda x: np.repeat(x.name, len(x)), axis=1) + expected = Series( + np.repeat(t[0], len(float_frame.columns)) for t in float_frame.itertuples() + ) + expected.index = float_frame.index + tm.assert_series_equal(result, expected) + + +def test_apply_multi_index(): + index = MultiIndex.from_arrays([["a", "a", "b"], ["c", "d", "d"]]) + s = DataFrame([[1, 2], [3, 4], [5, 6]], index=index, columns=["col1", "col2"]) + result = s.apply(lambda x: Series({"min": min(x), "max": max(x)}), 1) + expected = DataFrame([[1, 2], [3, 4], [5, 6]], index=index, columns=["min", "max"]) + tm.assert_frame_equal(result, expected, check_like=True) + + +@pytest.mark.parametrize( + "df, dicts", + [ + [ + DataFrame([["foo", "bar"], ["spam", "eggs"]]), + Series([{0: "foo", 1: "spam"}, {0: "bar", 1: "eggs"}]), + ], + [DataFrame([[0, 1], [2, 3]]), Series([{0: 0, 1: 2}, {0: 1, 1: 3}])], + ], +) +def test_apply_dict(df, dicts): + # GH 8735 + fn = lambda x: x.to_dict() + reduce_true = df.apply(fn, result_type="reduce") + reduce_false = df.apply(fn, result_type="expand") + reduce_none = df.apply(fn) + + tm.assert_series_equal(reduce_true, dicts) + tm.assert_frame_equal(reduce_false, df) + tm.assert_series_equal(reduce_none, dicts) + + +def test_apply_non_numpy_dtype(): + # GH 12244 + df = DataFrame({"dt": date_range("2015-01-01", periods=3, tz="Europe/Brussels")}) + result = df.apply(lambda x: x) + tm.assert_frame_equal(result, df) + + result = df.apply(lambda x: x + pd.Timedelta("1day")) + expected = DataFrame( + {"dt": date_range("2015-01-02", periods=3, tz="Europe/Brussels")} + ) + tm.assert_frame_equal(result, expected) + + +def test_apply_non_numpy_dtype_category(): + df = DataFrame({"dt": ["a", "b", "c", "a"]}, dtype="category") + result = df.apply(lambda x: x) + tm.assert_frame_equal(result, df) + + +def test_apply_dup_names_multi_agg(): + # GH 21063 + df = DataFrame([[0, 1], [2, 3]], columns=["a", "a"]) + expected = DataFrame([[0, 1]], columns=["a", "a"], index=["min"]) + result = df.agg(["min"]) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("op", ["apply", "agg"]) +def test_apply_nested_result_axis_1(op): + # GH 13820 + def apply_list(row): + return [2 * row["A"], 2 * row["C"], 2 * row["B"]] + + df = DataFrame(np.zeros((4, 4)), columns=list("ABCD")) + result = getattr(df, op)(apply_list, axis=1) + expected = Series( + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]] + ) + tm.assert_series_equal(result, expected) + + +def test_apply_noreduction_tzaware_object(): + # https://github.com/pandas-dev/pandas/issues/31505 + expected = DataFrame( + {"foo": [Timestamp("2020", tz="UTC")]}, dtype="datetime64[ns, UTC]" + ) + result = expected.apply(lambda x: x) + tm.assert_frame_equal(result, expected) + result = expected.apply(lambda x: x.copy()) + tm.assert_frame_equal(result, expected) + + +def test_apply_function_runs_once(): + # https://github.com/pandas-dev/pandas/issues/30815 + + df = DataFrame({"a": [1, 2, 3]}) + names = [] # Save row names function is applied to + + def reducing_function(row): + names.append(row.name) + + def non_reducing_function(row): + names.append(row.name) + return row + + for func in [reducing_function, non_reducing_function]: + del names[:] + + df.apply(func, axis=1) + assert names == list(df.index) + + +def test_apply_raw_function_runs_once(engine): + # https://github.com/pandas-dev/pandas/issues/34506 + if engine == "numba": + pytest.skip("appending to list outside of numba func is not supported") + + df = DataFrame({"a": [1, 2, 3]}) + values = [] # Save row values function is applied to + + def reducing_function(row): + values.extend(row) + + def non_reducing_function(row): + values.extend(row) + return row + + for func in [reducing_function, non_reducing_function]: + del values[:] + + df.apply(func, engine=engine, raw=True, axis=1) + assert values == list(df.a.to_list()) + + +def test_apply_with_byte_string(): + # GH 34529 + df = DataFrame(np.array([b"abcd", b"efgh"]), columns=["col"]) + expected = DataFrame(np.array([b"abcd", b"efgh"]), columns=["col"], dtype=object) + # After we make the apply we expect a dataframe just + # like the original but with the object datatype + result = df.apply(lambda x: x.astype("object")) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("val", ["asd", 12, None, np.nan]) +def test_apply_category_equalness(val): + # Check if categorical comparisons on apply, GH 21239 + df_values = ["asd", None, 12, "asd", "cde", np.nan] + df = DataFrame({"a": df_values}, dtype="category") + + result = df.a.apply(lambda x: x == val) + expected = Series( + [np.nan if pd.isnull(x) else x == val for x in df_values], name="a" + ) + tm.assert_series_equal(result, expected) + + +# the user has supplied an opaque UDF where +# they are transforming the input that requires +# us to infer the output + + +def test_infer_row_shape(): + # GH 17437 + # if row shape is changing, infer it + df = DataFrame(np.random.default_rng(2).random((10, 2))) + result = df.apply(np.fft.fft, axis=0).shape + assert result == (10, 2) + + result = df.apply(np.fft.rfft, axis=0).shape + assert result == (6, 2) + + +@pytest.mark.parametrize( + "ops, by_row, expected", + [ + ({"a": lambda x: x + 1}, "compat", DataFrame({"a": [2, 3]})), + ({"a": lambda x: x + 1}, False, DataFrame({"a": [2, 3]})), + ({"a": lambda x: x.sum()}, "compat", Series({"a": 3})), + ({"a": lambda x: x.sum()}, False, Series({"a": 3})), + ( + {"a": ["sum", np.sum, lambda x: x.sum()]}, + "compat", + DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", ""]), + ), + ( + {"a": ["sum", np.sum, lambda x: x.sum()]}, + False, + DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", ""]), + ), + ({"a": lambda x: 1}, "compat", DataFrame({"a": [1, 1]})), + ({"a": lambda x: 1}, False, Series({"a": 1})), + ], +) +def test_dictlike_lambda(ops, by_row, expected): + # GH53601 + df = DataFrame({"a": [1, 2]}) + result = df.apply(ops, by_row=by_row) + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "ops", + [ + {"a": lambda x: x + 1}, + {"a": lambda x: x.sum()}, + {"a": ["sum", np.sum, lambda x: x.sum()]}, + {"a": lambda x: 1}, + ], +) +def test_dictlike_lambda_raises(ops): + # GH53601 + df = DataFrame({"a": [1, 2]}) + with pytest.raises(ValueError, match="by_row=True not allowed"): + df.apply(ops, by_row=True) + + +def test_with_dictlike_columns(): + # GH 17602 + df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) + result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1) + expected = Series([{"s": 3} for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + df["tm"] = [ + Timestamp("2017-05-01 00:00:00"), + Timestamp("2017-05-02 00:00:00"), + ] + result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1) + tm.assert_series_equal(result, expected) + + # compose a series + result = (df["a"] + df["b"]).apply(lambda x: {"s": x}) + expected = Series([{"s": 3}, {"s": 3}]) + tm.assert_series_equal(result, expected) + + +def test_with_dictlike_columns_with_datetime(): + # GH 18775 + df = DataFrame() + df["author"] = ["X", "Y", "Z"] + df["publisher"] = ["BBC", "NBC", "N24"] + df["date"] = pd.to_datetime( + ["17-10-2010 07:15:30", "13-05-2011 08:20:35", "15-01-2013 09:09:09"], + dayfirst=True, + ) + result = df.apply(lambda x: {}, axis=1) + expected = Series([{}, {}, {}]) + tm.assert_series_equal(result, expected) + + +def test_with_dictlike_columns_with_infer(): + # GH 17602 + df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) + result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1, result_type="expand") + expected = DataFrame({"s": [3, 3]}) + tm.assert_frame_equal(result, expected) + + df["tm"] = [ + Timestamp("2017-05-01 00:00:00"), + Timestamp("2017-05-02 00:00:00"), + ] + result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1, result_type="expand") + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "ops, by_row, expected", + [ + ([lambda x: x + 1], "compat", DataFrame({("a", ""): [2, 3]})), + ([lambda x: x + 1], False, DataFrame({("a", ""): [2, 3]})), + ([lambda x: x.sum()], "compat", DataFrame({"a": [3]}, index=[""])), + ([lambda x: x.sum()], False, DataFrame({"a": [3]}, index=[""])), + ( + ["sum", np.sum, lambda x: x.sum()], + "compat", + DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", ""]), + ), + ( + ["sum", np.sum, lambda x: x.sum()], + False, + DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", ""]), + ), + ( + [lambda x: x + 1, lambda x: 3], + "compat", + DataFrame([[2, 3], [3, 3]], columns=[["a", "a"], ["", ""]]), + ), + ( + [lambda x: 2, lambda x: 3], + False, + DataFrame({"a": [2, 3]}, ["", ""]), + ), + ], +) +def test_listlike_lambda(ops, by_row, expected): + # GH53601 + df = DataFrame({"a": [1, 2]}) + result = df.apply(ops, by_row=by_row) + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "ops", + [ + [lambda x: x + 1], + [lambda x: x.sum()], + ["sum", np.sum, lambda x: x.sum()], + [lambda x: x + 1, lambda x: 3], + ], +) +def test_listlike_lambda_raises(ops): + # GH53601 + df = DataFrame({"a": [1, 2]}) + with pytest.raises(ValueError, match="by_row=True not allowed"): + df.apply(ops, by_row=True) + + +def test_with_listlike_columns(): + # GH 17348 + df = DataFrame( + { + "a": Series(np.random.default_rng(2).standard_normal(4)), + "b": ["a", "list", "of", "words"], + "ts": date_range("2016-10-01", periods=4, freq="h"), + } + ) + + result = df[["a", "b"]].apply(tuple, axis=1) + expected = Series([t[1:] for t in df[["a", "b"]].itertuples()]) + tm.assert_series_equal(result, expected) + + result = df[["a", "ts"]].apply(tuple, axis=1) + expected = Series([t[1:] for t in df[["a", "ts"]].itertuples()]) + tm.assert_series_equal(result, expected) + + +def test_with_listlike_columns_returning_list(): + # GH 18919 + df = DataFrame({"x": Series([["a", "b"], ["q"]]), "y": Series([["z"], ["q", "t"]])}) + df.index = MultiIndex.from_tuples([("i0", "j0"), ("i1", "j1")]) + + result = df.apply(lambda row: [el for el in row["x"] if el in row["y"]], axis=1) + expected = Series([[], ["q"]], index=df.index) + tm.assert_series_equal(result, expected) + + +def test_infer_output_shape_columns(): + # GH 18573 + + df = DataFrame( + { + "number": [1.0, 2.0], + "string": ["foo", "bar"], + "datetime": [ + Timestamp("2017-11-29 03:30:00"), + Timestamp("2017-11-29 03:45:00"), + ], + } + ) + result = df.apply(lambda row: (row.number, row.string), axis=1) + expected = Series([(t.number, t.string) for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + +def test_infer_output_shape_listlike_columns(): + # GH 16353 + + df = DataFrame( + np.random.default_rng(2).standard_normal((6, 3)), columns=["A", "B", "C"] + ) + + result = df.apply(lambda x: [1, 2, 3], axis=1) + expected = Series([[1, 2, 3] for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1) + expected = Series([[1, 2] for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("val", [1, 2]) +def test_infer_output_shape_listlike_columns_np_func(val): + # GH 17970 + df = DataFrame({"a": [1, 2, 3]}, index=list("abc")) + + result = df.apply(lambda row: np.ones(val), axis=1) + expected = Series([np.ones(val) for t in df.itertuples()], index=df.index) + tm.assert_series_equal(result, expected) + + +def test_infer_output_shape_listlike_columns_with_timestamp(): + # GH 17892 + df = DataFrame( + { + "a": [ + Timestamp("2010-02-01"), + Timestamp("2010-02-04"), + Timestamp("2010-02-05"), + Timestamp("2010-02-06"), + ], + "b": [9, 5, 4, 3], + "c": [5, 3, 4, 2], + "d": [1, 2, 3, 4], + } + ) + + def fun(x): + return (1, 2) + + result = df.apply(fun, axis=1) + expected = Series([(1, 2) for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("lst", [[1, 2, 3], [1, 2]]) +def test_consistent_coerce_for_shapes(lst): + # we want column names to NOT be propagated + # just because the shape matches the input shape + df = DataFrame( + np.random.default_rng(2).standard_normal((4, 3)), columns=["A", "B", "C"] + ) + + result = df.apply(lambda x: lst, axis=1) + expected = Series([lst for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + +def test_consistent_names(int_frame_const_col): + # if a Series is returned, we should use the resulting index names + df = int_frame_const_col + + result = df.apply( + lambda x: Series([1, 2, 3], index=["test", "other", "cols"]), axis=1 + ) + expected = int_frame_const_col.rename( + columns={"A": "test", "B": "other", "C": "cols"} + ) + tm.assert_frame_equal(result, expected) + + result = df.apply(lambda x: Series([1, 2], index=["test", "other"]), axis=1) + expected = expected[["test", "other"]] + tm.assert_frame_equal(result, expected) + + +def test_result_type(int_frame_const_col): + # result_type should be consistent no matter which + # path we take in the code + df = int_frame_const_col + + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand") + expected = df.copy() + expected.columns = [0, 1, 2] + tm.assert_frame_equal(result, expected) + + +def test_result_type_shorter_list(int_frame_const_col): + # result_type should be consistent no matter which + # path we take in the code + df = int_frame_const_col + result = df.apply(lambda x: [1, 2], axis=1, result_type="expand") + expected = df[["A", "B"]].copy() + expected.columns = [0, 1] + tm.assert_frame_equal(result, expected) + + +def test_result_type_broadcast(int_frame_const_col, request, engine): + # result_type should be consistent no matter which + # path we take in the code + if engine == "numba": + mark = pytest.mark.xfail(reason="numba engine doesn't support list return") + request.node.add_marker(mark) + df = int_frame_const_col + # broadcast result + result = df.apply( + lambda x: [1, 2, 3], axis=1, result_type="broadcast", engine=engine + ) + expected = df.copy() + tm.assert_frame_equal(result, expected) + + +def test_result_type_broadcast_series_func(int_frame_const_col, engine, request): + # result_type should be consistent no matter which + # path we take in the code + if engine == "numba": + mark = pytest.mark.xfail( + reason="numba Series constructor only support ndarrays not list data" + ) + request.node.add_marker(mark) + df = int_frame_const_col + columns = ["other", "col", "names"] + result = df.apply( + lambda x: Series([1, 2, 3], index=columns), + axis=1, + result_type="broadcast", + engine=engine, + ) + expected = df.copy() + tm.assert_frame_equal(result, expected) + + +def test_result_type_series_result(int_frame_const_col, engine, request): + # result_type should be consistent no matter which + # path we take in the code + if engine == "numba": + mark = pytest.mark.xfail( + reason="numba Series constructor only support ndarrays not list data" + ) + request.node.add_marker(mark) + df = int_frame_const_col + # series result + result = df.apply(lambda x: Series([1, 2, 3], index=x.index), axis=1, engine=engine) + expected = df.copy() + tm.assert_frame_equal(result, expected) + + +def test_result_type_series_result_other_index(int_frame_const_col, engine, request): + # result_type should be consistent no matter which + # path we take in the code + + if engine == "numba": + mark = pytest.mark.xfail( + reason="no support in numba Series constructor for list of columns" + ) + request.node.add_marker(mark) + df = int_frame_const_col + # series result with other index + columns = ["other", "col", "names"] + result = df.apply(lambda x: Series([1, 2, 3], index=columns), axis=1, engine=engine) + expected = df.copy() + expected.columns = columns + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "box", + [lambda x: list(x), lambda x: tuple(x), lambda x: np.array(x, dtype="int64")], + ids=["list", "tuple", "array"], +) +def test_consistency_for_boxed(box, int_frame_const_col): + # passing an array or list should not affect the output shape + df = int_frame_const_col + + result = df.apply(lambda x: box([1, 2]), axis=1) + expected = Series([box([1, 2]) for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + result = df.apply(lambda x: box([1, 2]), axis=1, result_type="expand") + expected = int_frame_const_col[["A", "B"]].rename(columns={"A": 0, "B": 1}) + tm.assert_frame_equal(result, expected) + + +def test_agg_transform(axis, float_frame): + other_axis = 1 if axis in {0, "index"} else 0 + + with np.errstate(all="ignore"): + f_abs = np.abs(float_frame) + f_sqrt = np.sqrt(float_frame) + + # ufunc + expected = f_sqrt.copy() + result = float_frame.apply(np.sqrt, axis=axis) + tm.assert_frame_equal(result, expected) + + # list-like + result = float_frame.apply([np.sqrt], axis=axis) + expected = f_sqrt.copy() + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product([float_frame.columns, ["sqrt"]]) + else: + expected.index = MultiIndex.from_product([float_frame.index, ["sqrt"]]) + tm.assert_frame_equal(result, expected) + + # multiple items in list + # these are in the order as if we are applying both + # functions per series and then concatting + result = float_frame.apply([np.abs, np.sqrt], axis=axis) + expected = zip_frames([f_abs, f_sqrt], axis=other_axis) + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product( + [float_frame.columns, ["absolute", "sqrt"]] + ) + else: + expected.index = MultiIndex.from_product( + [float_frame.index, ["absolute", "sqrt"]] + ) + tm.assert_frame_equal(result, expected) + + +def test_demo(): + # demonstration tests + df = DataFrame({"A": range(5), "B": 5}) + + result = df.agg(["min", "max"]) + expected = DataFrame( + {"A": [0, 4], "B": [5, 5]}, columns=["A", "B"], index=["min", "max"] + ) + tm.assert_frame_equal(result, expected) + + +def test_demo_dict_agg(): + # demonstration tests + df = DataFrame({"A": range(5), "B": 5}) + result = df.agg({"A": ["min", "max"], "B": ["sum", "max"]}) + expected = DataFrame( + {"A": [4.0, 0.0, np.nan], "B": [5.0, np.nan, 25.0]}, + columns=["A", "B"], + index=["max", "min", "sum"], + ) + tm.assert_frame_equal(result.reindex_like(expected), expected) + + +def test_agg_with_name_as_column_name(): + # GH 36212 - Column name is "name" + data = {"name": ["foo", "bar"]} + df = DataFrame(data) + + # result's name should be None + result = df.agg({"name": "count"}) + expected = Series({"name": 2}) + tm.assert_series_equal(result, expected) + + # Check if name is still preserved when aggregating series instead + result = df["name"].agg({"name": "count"}) + expected = Series({"name": 2}, name="name") + tm.assert_series_equal(result, expected) + + +def test_agg_multiple_mixed(): + # GH 20909 + mdf = DataFrame( + { + "A": [1, 2, 3], + "B": [1.0, 2.0, 3.0], + "C": ["foo", "bar", "baz"], + } + ) + expected = DataFrame( + { + "A": [1, 6], + "B": [1.0, 6.0], + "C": ["bar", "foobarbaz"], + }, + index=["min", "sum"], + ) + # sorted index + result = mdf.agg(["min", "sum"]) + tm.assert_frame_equal(result, expected) + + result = mdf[["C", "B", "A"]].agg(["sum", "min"]) + # GH40420: the result of .agg should have an index that is sorted + # according to the arguments provided to agg. + expected = expected[["C", "B", "A"]].reindex(["sum", "min"]) + tm.assert_frame_equal(result, expected) + + +def test_agg_multiple_mixed_raises(): + # GH 20909 + mdf = DataFrame( + { + "A": [1, 2, 3], + "B": [1.0, 2.0, 3.0], + "C": ["foo", "bar", "baz"], + "D": date_range("20130101", periods=3), + } + ) + + # sorted index + msg = "does not support reduction" + with pytest.raises(TypeError, match=msg): + mdf.agg(["min", "sum"]) + + with pytest.raises(TypeError, match=msg): + mdf[["D", "C", "B", "A"]].agg(["sum", "min"]) + + +def test_agg_reduce(axis, float_frame): + other_axis = 1 if axis in {0, "index"} else 0 + name1, name2 = float_frame.axes[other_axis].unique()[:2].sort_values() + + # all reducers + expected = pd.concat( + [ + float_frame.mean(axis=axis), + float_frame.max(axis=axis), + float_frame.sum(axis=axis), + ], + axis=1, + ) + expected.columns = ["mean", "max", "sum"] + expected = expected.T if axis in {0, "index"} else expected + + result = float_frame.agg(["mean", "max", "sum"], axis=axis) + tm.assert_frame_equal(result, expected) + + # dict input with scalars + func = {name1: "mean", name2: "sum"} + result = float_frame.agg(func, axis=axis) + expected = Series( + [ + float_frame.loc(other_axis)[name1].mean(), + float_frame.loc(other_axis)[name2].sum(), + ], + index=[name1, name2], + ) + tm.assert_series_equal(result, expected) + + # dict input with lists + func = {name1: ["mean"], name2: ["sum"]} + result = float_frame.agg(func, axis=axis) + expected = DataFrame( + { + name1: Series([float_frame.loc(other_axis)[name1].mean()], index=["mean"]), + name2: Series([float_frame.loc(other_axis)[name2].sum()], index=["sum"]), + } + ) + expected = expected.T if axis in {1, "columns"} else expected + tm.assert_frame_equal(result, expected) + + # dict input with lists with multiple + func = {name1: ["mean", "sum"], name2: ["sum", "max"]} + result = float_frame.agg(func, axis=axis) + expected = pd.concat( + { + name1: Series( + [ + float_frame.loc(other_axis)[name1].mean(), + float_frame.loc(other_axis)[name1].sum(), + ], + index=["mean", "sum"], + ), + name2: Series( + [ + float_frame.loc(other_axis)[name2].sum(), + float_frame.loc(other_axis)[name2].max(), + ], + index=["sum", "max"], + ), + }, + axis=1, + ) + expected = expected.T if axis in {1, "columns"} else expected + tm.assert_frame_equal(result, expected) + + +def test_nuiscance_columns(): + # GH 15015 + df = DataFrame( + { + "A": [1, 2, 3], + "B": [1.0, 2.0, 3.0], + "C": ["foo", "bar", "baz"], + "D": date_range("20130101", periods=3), + } + ) + + result = df.agg("min") + expected = Series([1, 1.0, "bar", Timestamp("20130101")], index=df.columns) + tm.assert_series_equal(result, expected) + + result = df.agg(["min"]) + expected = DataFrame( + [[1, 1.0, "bar", Timestamp("20130101").as_unit("ns")]], + index=["min"], + columns=df.columns, + ) + tm.assert_frame_equal(result, expected) + + msg = "does not support reduction" + with pytest.raises(TypeError, match=msg): + df.agg("sum") + + result = df[["A", "B", "C"]].agg("sum") + expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"]) + tm.assert_series_equal(result, expected) + + msg = "does not support reduction" + with pytest.raises(TypeError, match=msg): + df.agg(["sum"]) + + +@pytest.mark.parametrize("how", ["agg", "apply"]) +def test_non_callable_aggregates(how): + # GH 16405 + # 'size' is a property of frame/series + # validate that this is working + # GH 39116 - expand to apply + df = DataFrame( + {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]} + ) + + # Function aggregate + result = getattr(df, how)({"A": "count"}) + expected = Series({"A": 2}) + + tm.assert_series_equal(result, expected) + + # Non-function aggregate + result = getattr(df, how)({"A": "size"}) + expected = Series({"A": 3}) + + tm.assert_series_equal(result, expected) + + # Mix function and non-function aggs + result1 = getattr(df, how)(["count", "size"]) + result2 = getattr(df, how)( + {"A": ["count", "size"], "B": ["count", "size"], "C": ["count", "size"]} + ) + expected = DataFrame( + { + "A": {"count": 2, "size": 3}, + "B": {"count": 2, "size": 3}, + "C": {"count": 2, "size": 3}, + } + ) + + tm.assert_frame_equal(result1, result2, check_like=True) + tm.assert_frame_equal(result2, expected, check_like=True) + + # Just functional string arg is same as calling df.arg() + result = getattr(df, how)("count") + expected = df.count() + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("how", ["agg", "apply"]) +def test_size_as_str(how, axis): + # GH 39934 + df = DataFrame( + {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]} + ) + # Just a string attribute arg same as calling df.arg + # on the columns + result = getattr(df, how)("size", axis=axis) + if axis in (0, "index"): + expected = Series(df.shape[0], index=df.columns) + else: + expected = Series(df.shape[1], index=df.index) + tm.assert_series_equal(result, expected) + + +def test_agg_listlike_result(): + # GH-29587 user defined function returning list-likes + df = DataFrame({"A": [2, 2, 3], "B": [1.5, np.nan, 1.5], "C": ["foo", None, "bar"]}) + + def func(group_col): + return list(group_col.dropna().unique()) + + result = df.agg(func) + expected = Series([[2, 3], [1.5], ["foo", "bar"]], index=["A", "B", "C"]) + tm.assert_series_equal(result, expected) + + result = df.agg([func]) + expected = expected.to_frame("func").T + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize( + "args, kwargs", + [ + ((1, 2, 3), {}), + ((8, 7, 15), {}), + ((1, 2), {}), + ((1,), {"b": 2}), + ((), {"a": 1, "b": 2}), + ((), {"a": 2, "b": 1}), + ((), {"a": 1, "b": 2, "c": 3}), + ], +) +def test_agg_args_kwargs(axis, args, kwargs): + def f(x, a, b, c=3): + return x.sum() + (a + b) / c + + df = DataFrame([[1, 2], [3, 4]]) + + if axis == 0: + expected = Series([5.0, 7.0]) + else: + expected = Series([4.0, 8.0]) + + result = df.agg(f, axis, *args, **kwargs) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("num_cols", [2, 3, 5]) +def test_frequency_is_original(num_cols, engine, request): + # GH 22150 + if engine == "numba": + mark = pytest.mark.xfail(reason="numba engine only supports numeric indices") + request.node.add_marker(mark) + index = pd.DatetimeIndex(["1950-06-30", "1952-10-24", "1953-05-29"]) + original = index.copy() + df = DataFrame(1, index=index, columns=range(num_cols)) + df.apply(lambda x: x, engine=engine) + assert index.freq == original.freq + + +def test_apply_datetime_tz_issue(engine, request): + # GH 29052 + + if engine == "numba": + mark = pytest.mark.xfail( + reason="numba engine doesn't support non-numeric indexes" + ) + request.node.add_marker(mark) + + timestamps = [ + Timestamp("2019-03-15 12:34:31.909000+0000", tz="UTC"), + Timestamp("2019-03-15 12:34:34.359000+0000", tz="UTC"), + Timestamp("2019-03-15 12:34:34.660000+0000", tz="UTC"), + ] + df = DataFrame(data=[0, 1, 2], index=timestamps) + result = df.apply(lambda x: x.name, axis=1, engine=engine) + expected = Series(index=timestamps, data=timestamps) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("df", [DataFrame({"A": ["a", None], "B": ["c", "d"]})]) +@pytest.mark.parametrize("method", ["min", "max", "sum"]) +def test_mixed_column_raises(df, method, using_infer_string): + # GH 16832 + if method == "sum": + msg = r'can only concatenate str \(not "int"\) to str|does not support' + else: + msg = "not supported between instances of 'str' and 'float'" + if not using_infer_string: + with pytest.raises(TypeError, match=msg): + getattr(df, method)() + else: + getattr(df, method)() + + +@pytest.mark.parametrize("col", [1, 1.0, True, "a", np.nan]) +def test_apply_dtype(col): + # GH 31466 + df = DataFrame([[1.0, col]], columns=["a", "b"]) + result = df.apply(lambda x: x.dtype) + expected = df.dtypes + + tm.assert_series_equal(result, expected) + + +def test_apply_mutating(using_array_manager, using_copy_on_write, warn_copy_on_write): + # GH#35462 case where applied func pins a new BlockManager to a row + df = DataFrame({"a": range(100), "b": range(100, 200)}) + df_orig = df.copy() + + def func(row): + mgr = row._mgr + row.loc["a"] += 1 + assert row._mgr is not mgr + return row + + expected = df.copy() + expected["a"] += 1 + + with tm.assert_cow_warning(warn_copy_on_write): + result = df.apply(func, axis=1) + + tm.assert_frame_equal(result, expected) + if using_copy_on_write or using_array_manager: + # INFO(CoW) With copy on write, mutating a viewing row doesn't mutate the parent + # INFO(ArrayManager) With BlockManager, the row is a view and mutated in place, + # with ArrayManager the row is not a view, and thus not mutated in place + tm.assert_frame_equal(df, df_orig) + else: + tm.assert_frame_equal(df, result) + + +def test_apply_empty_list_reduce(): + # GH#35683 get columns correct + df = DataFrame([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]], columns=["a", "b"]) + + result = df.apply(lambda x: [], result_type="reduce") + expected = Series({"a": [], "b": []}, dtype=object) + tm.assert_series_equal(result, expected) + + +def test_apply_no_suffix_index(engine, request): + # GH36189 + if engine == "numba": + mark = pytest.mark.xfail( + reason="numba engine doesn't support list-likes/dict-like callables" + ) + request.node.add_marker(mark) + pdf = DataFrame([[4, 9]] * 3, columns=["A", "B"]) + result = pdf.apply(["sum", lambda x: x.sum(), lambda x: x.sum()], engine=engine) + expected = DataFrame( + {"A": [12, 12, 12], "B": [27, 27, 27]}, index=["sum", "", ""] + ) + + tm.assert_frame_equal(result, expected) + + +def test_apply_raw_returns_string(engine): + # https://github.com/pandas-dev/pandas/issues/35940 + if engine == "numba": + pytest.skip("No object dtype support in numba") + df = DataFrame({"A": ["aa", "bbb"]}) + result = df.apply(lambda x: x[0], engine=engine, axis=1, raw=True) + expected = Series(["aa", "bbb"]) + tm.assert_series_equal(result, expected) + + +def test_aggregation_func_column_order(): + # GH40420: the result of .agg should have an index that is sorted + # according to the arguments provided to agg. + df = DataFrame( + [ + (1, 0, 0), + (2, 0, 0), + (3, 0, 0), + (4, 5, 4), + (5, 6, 6), + (6, 7, 7), + ], + columns=("att1", "att2", "att3"), + ) + + def sum_div2(s): + return s.sum() / 2 + + aggs = ["sum", sum_div2, "count", "min"] + result = df.agg(aggs) + expected = DataFrame( + { + "att1": [21.0, 10.5, 6.0, 1.0], + "att2": [18.0, 9.0, 6.0, 0.0], + "att3": [17.0, 8.5, 6.0, 0.0], + }, + index=["sum", "sum_div2", "count", "min"], + ) + tm.assert_frame_equal(result, expected) + + +def test_apply_getitem_axis_1(engine, request): + # GH 13427 + if engine == "numba": + mark = pytest.mark.xfail( + reason="numba engine not supporting duplicate index values" + ) + request.node.add_marker(mark) + df = DataFrame({"a": [0, 1, 2], "b": [1, 2, 3]}) + result = df[["a", "a"]].apply( + lambda x: x.iloc[0] + x.iloc[1], axis=1, engine=engine + ) + expected = Series([0, 2, 4]) + tm.assert_series_equal(result, expected) + + +def test_nuisance_depr_passes_through_warnings(): + # GH 43740 + # DataFrame.agg with list-likes may emit warnings for both individual + # args and for entire columns, but we only want to emit once. We + # catch and suppress the warnings for individual args, but need to make + # sure if some other warnings were raised, they get passed through to + # the user. + + def expected_warning(x): + warnings.warn("Hello, World!") + return x.sum() + + df = DataFrame({"a": [1, 2, 3]}) + with tm.assert_produces_warning(UserWarning, match="Hello, World!"): + df.agg([expected_warning]) + + +def test_apply_type(): + # GH 46719 + df = DataFrame( + {"col1": [3, "string", float], "col2": [0.25, datetime(2020, 1, 1), np.nan]}, + index=["a", "b", "c"], + ) + + # axis=0 + result = df.apply(type, axis=0) + expected = Series({"col1": Series, "col2": Series}) + tm.assert_series_equal(result, expected) + + # axis=1 + result = df.apply(type, axis=1) + expected = Series({"a": Series, "b": Series, "c": Series}) + tm.assert_series_equal(result, expected) + + +def test_apply_on_empty_dataframe(engine): + # GH 39111 + df = DataFrame({"a": [1, 2], "b": [3, 0]}) + result = df.head(0).apply(lambda x: max(x["a"], x["b"]), axis=1, engine=engine) + expected = Series([], dtype=np.float64) + tm.assert_series_equal(result, expected) + + +def test_apply_return_list(): + df = DataFrame({"a": [1, 2], "b": [2, 3]}) + result = df.apply(lambda x: [x.values]) + expected = DataFrame({"a": [[1, 2]], "b": [[2, 3]]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "test, constant", + [ + ({"a": [1, 2, 3], "b": [1, 1, 1]}, {"a": [1, 2, 3], "b": [1]}), + ({"a": [2, 2, 2], "b": [1, 1, 1]}, {"a": [2], "b": [1]}), + ], +) +def test_unique_agg_type_is_series(test, constant): + # GH#22558 + df1 = DataFrame(test) + expected = Series(data=constant, index=["a", "b"], dtype="object") + aggregation = {"a": "unique", "b": "unique"} + + result = df1.agg(aggregation) + + tm.assert_series_equal(result, expected) + + +def test_any_apply_keyword_non_zero_axis_regression(): + # https://github.com/pandas-dev/pandas/issues/48656 + df = DataFrame({"A": [1, 2, 0], "B": [0, 2, 0], "C": [0, 0, 0]}) + expected = Series([True, True, False]) + tm.assert_series_equal(df.any(axis=1), expected) + + result = df.apply("any", axis=1) + tm.assert_series_equal(result, expected) + + result = df.apply("any", 1) + tm.assert_series_equal(result, expected) + + +def test_agg_mapping_func_deprecated(): + # GH 53325 + df = DataFrame({"x": [1, 2, 3]}) + + def foo1(x, a=1, c=0): + return x + a + c + + def foo2(x, b=2, c=0): + return x + b + c + + # single func already takes the vectorized path + result = df.agg(foo1, 0, 3, c=4) + expected = df + 7 + tm.assert_frame_equal(result, expected) + + msg = "using .+ in Series.agg cannot aggregate and" + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.agg([foo1, foo2], 0, 3, c=4) + expected = DataFrame( + [[8, 8], [9, 9], [10, 10]], columns=[["x", "x"], ["foo1", "foo2"]] + ) + tm.assert_frame_equal(result, expected) + + # TODO: the result below is wrong, should be fixed (GH53325) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.agg({"x": foo1}, 0, 3, c=4) + expected = DataFrame([2, 3, 4], columns=["x"]) + tm.assert_frame_equal(result, expected) + + +def test_agg_std(): + df = DataFrame(np.arange(6).reshape(3, 2), columns=["A", "B"]) + + with tm.assert_produces_warning(FutureWarning, match="using DataFrame.std"): + result = df.agg(np.std) + expected = Series({"A": 2.0, "B": 2.0}, dtype=float) + tm.assert_series_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning, match="using Series.std"): + result = df.agg([np.std]) + expected = DataFrame({"A": 2.0, "B": 2.0}, index=["std"]) + tm.assert_frame_equal(result, expected) + + +def test_agg_dist_like_and_nonunique_columns(): + # GH#51099 + df = DataFrame( + {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]} + ) + df.columns = ["A", "A", "C"] + + result = df.agg({"A": "count"}) + expected = df["A"].count() + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_frame_apply_relabeling.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_frame_apply_relabeling.py new file mode 100644 index 0000000000000000000000000000000000000000..723bdd349c0cb8a8f3fe73ded665b6d22260ffb5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_frame_apply_relabeling.py @@ -0,0 +1,113 @@ +import numpy as np +import pytest + +from pandas.compat.numpy import np_version_gte1p25 + +import pandas as pd +import pandas._testing as tm + + +def test_agg_relabel(): + # GH 26513 + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + + # simplest case with one column, one func + result = df.agg(foo=("B", "sum")) + expected = pd.DataFrame({"B": [10]}, index=pd.Index(["foo"])) + tm.assert_frame_equal(result, expected) + + # test on same column with different methods + result = df.agg(foo=("B", "sum"), bar=("B", "min")) + expected = pd.DataFrame({"B": [10, 1]}, index=pd.Index(["foo", "bar"])) + + tm.assert_frame_equal(result, expected) + + +def test_agg_relabel_multi_columns_multi_methods(): + # GH 26513, test on multiple columns with multiple methods + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + result = df.agg( + foo=("A", "sum"), + bar=("B", "mean"), + cat=("A", "min"), + dat=("B", "max"), + f=("A", "max"), + g=("C", "min"), + ) + expected = pd.DataFrame( + { + "A": [6.0, np.nan, 1.0, np.nan, 2.0, np.nan], + "B": [np.nan, 2.5, np.nan, 4.0, np.nan, np.nan], + "C": [np.nan, np.nan, np.nan, np.nan, np.nan, 3.0], + }, + index=pd.Index(["foo", "bar", "cat", "dat", "f", "g"]), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.xfail(np_version_gte1p25, reason="name of min now equals name of np.min") +def test_agg_relabel_partial_functions(): + # GH 26513, test on partial, functools or more complex cases + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + msg = "using Series.[mean|min]" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.agg(foo=("A", np.mean), bar=("A", "mean"), cat=("A", min)) + expected = pd.DataFrame( + {"A": [1.5, 1.5, 1.0]}, index=pd.Index(["foo", "bar", "cat"]) + ) + tm.assert_frame_equal(result, expected) + + msg = "using Series.[mean|min|max|sum]" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.agg( + foo=("A", min), + bar=("A", np.min), + cat=("B", max), + dat=("C", "min"), + f=("B", np.sum), + kk=("B", lambda x: min(x)), + ) + expected = pd.DataFrame( + { + "A": [1.0, 1.0, np.nan, np.nan, np.nan, np.nan], + "B": [np.nan, np.nan, 4.0, np.nan, 10.0, 1.0], + "C": [np.nan, np.nan, np.nan, 3.0, np.nan, np.nan], + }, + index=pd.Index(["foo", "bar", "cat", "dat", "f", "kk"]), + ) + tm.assert_frame_equal(result, expected) + + +def test_agg_namedtuple(): + # GH 26513 + df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) + result = df.agg( + foo=pd.NamedAgg("B", "sum"), + bar=pd.NamedAgg("B", "min"), + cat=pd.NamedAgg(column="B", aggfunc="count"), + fft=pd.NamedAgg("B", aggfunc="max"), + ) + + expected = pd.DataFrame( + {"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"]) + ) + tm.assert_frame_equal(result, expected) + + result = df.agg( + foo=pd.NamedAgg("A", "min"), + bar=pd.NamedAgg(column="B", aggfunc="max"), + cat=pd.NamedAgg(column="A", aggfunc="max"), + ) + expected = pd.DataFrame( + {"A": [0.0, np.nan, 1.0], "B": [np.nan, 2.0, np.nan]}, + index=pd.Index(["foo", "bar", "cat"]), + ) + tm.assert_frame_equal(result, expected) + + +def test_reconstruct_func(): + # GH 28472, test to ensure reconstruct_func isn't moved; + # This method is used by other libraries (e.g. dask) + result = pd.core.apply.reconstruct_func("min") + expected = (False, "min", None, None) + tm.assert_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_frame_transform.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_frame_transform.py new file mode 100644 index 0000000000000000000000000000000000000000..558d76ae8fdc4b95d46bbe94e15822779bd7c53f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_frame_transform.py @@ -0,0 +1,264 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + MultiIndex, + Series, +) +import pandas._testing as tm +from pandas.tests.apply.common import frame_transform_kernels +from pandas.tests.frame.common import zip_frames + + +def unpack_obj(obj, klass, axis): + """ + Helper to ensure we have the right type of object for a test parametrized + over frame_or_series. + """ + if klass is not DataFrame: + obj = obj["A"] + if axis != 0: + pytest.skip(f"Test is only for DataFrame with axis={axis}") + return obj + + +def test_transform_ufunc(axis, float_frame, frame_or_series): + # GH 35964 + obj = unpack_obj(float_frame, frame_or_series, axis) + + with np.errstate(all="ignore"): + f_sqrt = np.sqrt(obj) + + # ufunc + result = obj.transform(np.sqrt, axis=axis) + expected = f_sqrt + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "ops, names", + [ + ([np.sqrt], ["sqrt"]), + ([np.abs, np.sqrt], ["absolute", "sqrt"]), + (np.array([np.sqrt]), ["sqrt"]), + (np.array([np.abs, np.sqrt]), ["absolute", "sqrt"]), + ], +) +def test_transform_listlike(axis, float_frame, ops, names): + # GH 35964 + other_axis = 1 if axis in {0, "index"} else 0 + with np.errstate(all="ignore"): + expected = zip_frames([op(float_frame) for op in ops], axis=other_axis) + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product([float_frame.columns, names]) + else: + expected.index = MultiIndex.from_product([float_frame.index, names]) + result = float_frame.transform(ops, axis=axis) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("ops", [[], np.array([])]) +def test_transform_empty_listlike(float_frame, ops, frame_or_series): + obj = unpack_obj(float_frame, frame_or_series, 0) + + with pytest.raises(ValueError, match="No transform functions were provided"): + obj.transform(ops) + + +def test_transform_listlike_func_with_args(): + # GH 50624 + df = DataFrame({"x": [1, 2, 3]}) + + def foo1(x, a=1, c=0): + return x + a + c + + def foo2(x, b=2, c=0): + return x + b + c + + msg = r"foo1\(\) got an unexpected keyword argument 'b'" + with pytest.raises(TypeError, match=msg): + df.transform([foo1, foo2], 0, 3, b=3, c=4) + + result = df.transform([foo1, foo2], 0, 3, c=4) + expected = DataFrame( + [[8, 8], [9, 9], [10, 10]], + columns=MultiIndex.from_tuples([("x", "foo1"), ("x", "foo2")]), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("box", [dict, Series]) +def test_transform_dictlike(axis, float_frame, box): + # GH 35964 + if axis in (0, "index"): + e = float_frame.columns[0] + expected = float_frame[[e]].transform(np.abs) + else: + e = float_frame.index[0] + expected = float_frame.iloc[[0]].transform(np.abs) + result = float_frame.transform(box({e: np.abs}), axis=axis) + tm.assert_frame_equal(result, expected) + + +def test_transform_dictlike_mixed(): + # GH 40018 - mix of lists and non-lists in values of a dictionary + df = DataFrame({"a": [1, 2], "b": [1, 4], "c": [1, 4]}) + result = df.transform({"b": ["sqrt", "abs"], "c": "sqrt"}) + expected = DataFrame( + [[1.0, 1, 1.0], [2.0, 4, 2.0]], + columns=MultiIndex([("b", "c"), ("sqrt", "abs")], [(0, 0, 1), (0, 1, 0)]), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "ops", + [ + {}, + {"A": []}, + {"A": [], "B": "cumsum"}, + {"A": "cumsum", "B": []}, + {"A": [], "B": ["cumsum"]}, + {"A": ["cumsum"], "B": []}, + ], +) +def test_transform_empty_dictlike(float_frame, ops, frame_or_series): + obj = unpack_obj(float_frame, frame_or_series, 0) + + with pytest.raises(ValueError, match="No transform functions were provided"): + obj.transform(ops) + + +@pytest.mark.parametrize("use_apply", [True, False]) +def test_transform_udf(axis, float_frame, use_apply, frame_or_series): + # GH 35964 + obj = unpack_obj(float_frame, frame_or_series, axis) + + # transform uses UDF either via apply or passing the entire DataFrame + def func(x): + # transform is using apply iff x is not a DataFrame + if use_apply == isinstance(x, frame_or_series): + # Force transform to fallback + raise ValueError + return x + 1 + + result = obj.transform(func, axis=axis) + expected = obj + 1 + tm.assert_equal(result, expected) + + +wont_fail = ["ffill", "bfill", "fillna", "pad", "backfill", "shift"] +frame_kernels_raise = [x for x in frame_transform_kernels if x not in wont_fail] + + +@pytest.mark.parametrize("op", [*frame_kernels_raise, lambda x: x + 1]) +def test_transform_bad_dtype(op, frame_or_series, request): + # GH 35964 + if op == "ngroup": + request.applymarker( + pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame") + ) + + obj = DataFrame({"A": 3 * [object]}) # DataFrame that will fail on most transforms + obj = tm.get_obj(obj, frame_or_series) + error = TypeError + msg = "|".join( + [ + "not supported between instances of 'type' and 'type'", + "unsupported operand type", + ] + ) + + with pytest.raises(error, match=msg): + obj.transform(op) + with pytest.raises(error, match=msg): + obj.transform([op]) + with pytest.raises(error, match=msg): + obj.transform({"A": op}) + with pytest.raises(error, match=msg): + obj.transform({"A": [op]}) + + +@pytest.mark.parametrize("op", frame_kernels_raise) +def test_transform_failure_typeerror(request, op): + # GH 35964 + + if op == "ngroup": + request.applymarker( + pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame") + ) + + # Using object makes most transform kernels fail + df = DataFrame({"A": 3 * [object], "B": [1, 2, 3]}) + error = TypeError + msg = "|".join( + [ + "not supported between instances of 'type' and 'type'", + "unsupported operand type", + ] + ) + + with pytest.raises(error, match=msg): + df.transform([op]) + + with pytest.raises(error, match=msg): + df.transform({"A": op, "B": op}) + + with pytest.raises(error, match=msg): + df.transform({"A": [op], "B": [op]}) + + with pytest.raises(error, match=msg): + df.transform({"A": [op, "shift"], "B": [op]}) + + +def test_transform_failure_valueerror(): + # GH 40211 + def op(x): + if np.sum(np.sum(x)) < 10: + raise ValueError + return x + + df = DataFrame({"A": [1, 2, 3], "B": [400, 500, 600]}) + msg = "Transform function failed" + + with pytest.raises(ValueError, match=msg): + df.transform([op]) + + with pytest.raises(ValueError, match=msg): + df.transform({"A": op, "B": op}) + + with pytest.raises(ValueError, match=msg): + df.transform({"A": [op], "B": [op]}) + + with pytest.raises(ValueError, match=msg): + df.transform({"A": [op, "shift"], "B": [op]}) + + +@pytest.mark.parametrize("use_apply", [True, False]) +def test_transform_passes_args(use_apply, frame_or_series): + # GH 35964 + # transform uses UDF either via apply or passing the entire DataFrame + expected_args = [1, 2] + expected_kwargs = {"c": 3} + + def f(x, a, b, c): + # transform is using apply iff x is not a DataFrame + if use_apply == isinstance(x, frame_or_series): + # Force transform to fallback + raise ValueError + assert [a, b] == expected_args + assert c == expected_kwargs["c"] + return x + + frame_or_series([1]).transform(f, 0, *expected_args, **expected_kwargs) + + +def test_transform_empty_dataframe(): + # https://github.com/pandas-dev/pandas/issues/39636 + df = DataFrame([], columns=["col1", "col2"]) + result = df.transform(lambda x: x + 10) + tm.assert_frame_equal(result, df) + + result = df["col1"].transform(lambda x: x + 10) + tm.assert_series_equal(result, df["col1"]) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_invalid_arg.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_invalid_arg.py new file mode 100644 index 0000000000000000000000000000000000000000..b5ad1094f5bf5a839600457e133aea1540655cc3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_invalid_arg.py @@ -0,0 +1,361 @@ +# Tests specifically aimed at detecting bad arguments. +# This file is organized by reason for exception. +# 1. always invalid argument values +# 2. missing column(s) +# 3. incompatible ops/dtype/args/kwargs +# 4. invalid result shape/type +# If your test does not fit into one of these categories, add to this list. + +from itertools import chain +import re + +import numpy as np +import pytest + +from pandas.errors import SpecificationError + +from pandas import ( + DataFrame, + Series, + date_range, +) +import pandas._testing as tm + + +@pytest.mark.parametrize("result_type", ["foo", 1]) +def test_result_type_error(result_type): + # allowed result_type + df = DataFrame( + np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1, + columns=["A", "B", "C"], + ) + + msg = ( + "invalid value for result_type, must be one of " + "{None, 'reduce', 'broadcast', 'expand'}" + ) + with pytest.raises(ValueError, match=msg): + df.apply(lambda x: [1, 2, 3], axis=1, result_type=result_type) + + +def test_apply_invalid_axis_value(): + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"]) + msg = "No axis named 2 for object type DataFrame" + with pytest.raises(ValueError, match=msg): + df.apply(lambda x: x, 2) + + +def test_agg_raises(): + # GH 26513 + df = DataFrame({"A": [0, 1], "B": [1, 2]}) + msg = "Must provide" + + with pytest.raises(TypeError, match=msg): + df.agg() + + +def test_map_with_invalid_na_action_raises(): + # https://github.com/pandas-dev/pandas/issues/32815 + s = Series([1, 2, 3]) + msg = "na_action must either be 'ignore' or None" + with pytest.raises(ValueError, match=msg): + s.map(lambda x: x, na_action="____") + + +@pytest.mark.parametrize("input_na_action", ["____", True]) +def test_map_arg_is_dict_with_invalid_na_action_raises(input_na_action): + # https://github.com/pandas-dev/pandas/issues/46588 + s = Series([1, 2, 3]) + msg = f"na_action must either be 'ignore' or None, {input_na_action} was passed" + with pytest.raises(ValueError, match=msg): + s.map({1: 2}, na_action=input_na_action) + + +@pytest.mark.parametrize("method", ["apply", "agg", "transform"]) +@pytest.mark.parametrize("func", [{"A": {"B": "sum"}}, {"A": {"B": ["sum"]}}]) +def test_nested_renamer(frame_or_series, method, func): + # GH 35964 + obj = frame_or_series({"A": [1]}) + match = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=match): + getattr(obj, method)(func) + + +@pytest.mark.parametrize( + "renamer", + [{"foo": ["min", "max"]}, {"foo": ["min", "max"], "bar": ["sum", "mean"]}], +) +def test_series_nested_renamer(renamer): + s = Series(range(6), dtype="int64", name="series") + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + s.agg(renamer) + + +def test_apply_dict_depr(): + tsdf = DataFrame( + np.random.default_rng(2).standard_normal((10, 3)), + columns=["A", "B", "C"], + index=date_range("1/1/2000", periods=10), + ) + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + tsdf.A.agg({"foo": ["sum", "mean"]}) + + +@pytest.mark.parametrize("method", ["agg", "transform"]) +def test_dict_nested_renaming_depr(method): + df = DataFrame({"A": range(5), "B": 5}) + + # nested renaming + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + getattr(df, method)({"A": {"foo": "min"}, "B": {"bar": "max"}}) + + +@pytest.mark.parametrize("method", ["apply", "agg", "transform"]) +@pytest.mark.parametrize("func", [{"B": "sum"}, {"B": ["sum"]}]) +def test_missing_column(method, func): + # GH 40004 + obj = DataFrame({"A": [1]}) + match = re.escape("Column(s) ['B'] do not exist") + with pytest.raises(KeyError, match=match): + getattr(obj, method)(func) + + +def test_transform_mixed_column_name_dtypes(): + # GH39025 + df = DataFrame({"a": ["1"]}) + msg = r"Column\(s\) \[1, 'b'\] do not exist" + with pytest.raises(KeyError, match=msg): + df.transform({"a": int, 1: str, "b": int}) + + +@pytest.mark.parametrize( + "how, args", [("pct_change", ()), ("nsmallest", (1, ["a", "b"])), ("tail", 1)] +) +def test_apply_str_axis_1_raises(how, args): + # GH 39211 - some ops don't support axis=1 + df = DataFrame({"a": [1, 2], "b": [3, 4]}) + msg = f"Operation {how} does not support axis=1" + with pytest.raises(ValueError, match=msg): + df.apply(how, axis=1, args=args) + + +def test_transform_axis_1_raises(): + # GH 35964 + msg = "No axis named 1 for object type Series" + with pytest.raises(ValueError, match=msg): + Series([1]).transform("sum", axis=1) + + +def test_apply_modify_traceback(): + data = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.default_rng(2).standard_normal(11), + "E": np.random.default_rng(2).standard_normal(11), + "F": np.random.default_rng(2).standard_normal(11), + } + ) + + data.loc[4, "C"] = np.nan + + def transform(row): + if row["C"].startswith("shin") and row["A"] == "foo": + row["D"] = 7 + return row + + msg = "'float' object has no attribute 'startswith'" + with pytest.raises(AttributeError, match=msg): + data.apply(transform, axis=1) + + +@pytest.mark.parametrize( + "df, func, expected", + tm.get_cython_table_params( + DataFrame([["a", "b"], ["b", "a"]]), [["cumprod", TypeError]] + ), +) +def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_string): + # GH 21224 + if using_infer_string: + import pyarrow as pa + + expected = (expected, pa.lib.ArrowNotImplementedError) + + msg = "can't multiply sequence by non-int of type 'str'|has no kernel" + warn = None if isinstance(func, str) else FutureWarning + with pytest.raises(expected, match=msg): + with tm.assert_produces_warning(warn, match="using DataFrame.cumprod"): + df.agg(func, axis=axis) + + +@pytest.mark.parametrize( + "series, func, expected", + chain( + tm.get_cython_table_params( + Series("a b c".split()), + [ + ("mean", TypeError), # mean raises TypeError + ("prod", TypeError), + ("std", TypeError), + ("var", TypeError), + ("median", TypeError), + ("cumprod", TypeError), + ], + ) + ), +) +def test_agg_cython_table_raises_series(series, func, expected, using_infer_string): + # GH21224 + msg = r"[Cc]ould not convert|can't multiply sequence by non-int of type" + if func == "median" or func is np.nanmedian or func is np.median: + msg = r"Cannot convert \['a' 'b' 'c'\] to numeric" + + if using_infer_string: + import pyarrow as pa + + expected = (expected, pa.lib.ArrowNotImplementedError) + + msg = msg + "|does not support|has no kernel" + warn = None if isinstance(func, str) else FutureWarning + + with pytest.raises(expected, match=msg): + # e.g. Series('a b'.split()).cumprod() will raise + with tm.assert_produces_warning(warn, match="is currently using Series.*"): + series.agg(func) + + +def test_agg_none_to_type(): + # GH 40543 + df = DataFrame({"a": [None]}) + msg = re.escape("int() argument must be a string") + with pytest.raises(TypeError, match=msg): + df.agg({"a": lambda x: int(x.iloc[0])}) + + +def test_transform_none_to_type(): + # GH#34377 + df = DataFrame({"a": [None]}) + msg = "argument must be a" + with pytest.raises(TypeError, match=msg): + df.transform({"a": lambda x: int(x.iloc[0])}) + + +@pytest.mark.parametrize( + "func", + [ + lambda x: np.array([1, 2]).reshape(-1, 2), + lambda x: [1, 2], + lambda x: Series([1, 2]), + ], +) +def test_apply_broadcast_error(func): + df = DataFrame( + np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1, + columns=["A", "B", "C"], + ) + + # > 1 ndim + msg = "too many dims to broadcast|cannot broadcast result" + with pytest.raises(ValueError, match=msg): + df.apply(func, axis=1, result_type="broadcast") + + +def test_transform_and_agg_err_agg(axis, float_frame): + # cannot both transform and agg + msg = "cannot combine transform and aggregation operations" + with pytest.raises(ValueError, match=msg): + with np.errstate(all="ignore"): + float_frame.agg(["max", "sqrt"], axis=axis) + + +@pytest.mark.filterwarnings("ignore::FutureWarning") # GH53325 +@pytest.mark.parametrize( + "func, msg", + [ + (["sqrt", "max"], "cannot combine transform and aggregation"), + ( + {"foo": np.sqrt, "bar": "sum"}, + "cannot perform both aggregation and transformation", + ), + ], +) +def test_transform_and_agg_err_series(string_series, func, msg): + # we are trying to transform with an aggregator + with pytest.raises(ValueError, match=msg): + with np.errstate(all="ignore"): + string_series.agg(func) + + +@pytest.mark.parametrize("func", [["max", "min"], ["max", "sqrt"]]) +def test_transform_wont_agg_frame(axis, float_frame, func): + # GH 35964 + # cannot both transform and agg + msg = "Function did not transform" + with pytest.raises(ValueError, match=msg): + float_frame.transform(func, axis=axis) + + +@pytest.mark.parametrize("func", [["min", "max"], ["sqrt", "max"]]) +def test_transform_wont_agg_series(string_series, func): + # GH 35964 + # we are trying to transform with an aggregator + msg = "Function did not transform" + + with pytest.raises(ValueError, match=msg): + string_series.transform(func) + + +@pytest.mark.parametrize( + "op_wrapper", [lambda x: x, lambda x: [x], lambda x: {"A": x}, lambda x: {"A": [x]}] +) +def test_transform_reducer_raises(all_reductions, frame_or_series, op_wrapper): + # GH 35964 + op = op_wrapper(all_reductions) + + obj = DataFrame({"A": [1, 2, 3]}) + obj = tm.get_obj(obj, frame_or_series) + + msg = "Function did not transform" + with pytest.raises(ValueError, match=msg): + obj.transform(op) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_numba.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_numba.py new file mode 100644 index 0000000000000000000000000000000000000000..57b81711ddb48a7390d33b4624f9956fbcd473a8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_numba.py @@ -0,0 +1,118 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import ( + DataFrame, + Index, +) +import pandas._testing as tm + +pytestmark = [td.skip_if_no("numba"), pytest.mark.single_cpu] + + +@pytest.fixture(params=[0, 1]) +def apply_axis(request): + return request.param + + +def test_numba_vs_python_noop(float_frame, apply_axis): + func = lambda x: x + result = float_frame.apply(func, engine="numba", axis=apply_axis) + expected = float_frame.apply(func, engine="python", axis=apply_axis) + tm.assert_frame_equal(result, expected) + + +def test_numba_vs_python_string_index(): + # GH#56189 + pytest.importorskip("pyarrow") + df = DataFrame( + 1, + index=Index(["a", "b"], dtype="string[pyarrow_numpy]"), + columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + ) + func = lambda x: x + result = df.apply(func, engine="numba", axis=0) + expected = df.apply(func, engine="python", axis=0) + tm.assert_frame_equal( + result, expected, check_column_type=False, check_index_type=False + ) + + +def test_numba_vs_python_indexing(): + frame = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]}, + index=Index(["A", "B", "C"]), + ) + row_func = lambda x: x["c"] + result = frame.apply(row_func, engine="numba", axis=1) + expected = frame.apply(row_func, engine="python", axis=1) + tm.assert_series_equal(result, expected) + + col_func = lambda x: x["A"] + result = frame.apply(col_func, engine="numba", axis=0) + expected = frame.apply(col_func, engine="python", axis=0) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "reduction", + [lambda x: x.mean(), lambda x: x.min(), lambda x: x.max(), lambda x: x.sum()], +) +def test_numba_vs_python_reductions(reduction, apply_axis): + df = DataFrame(np.ones((4, 4), dtype=np.float64)) + result = df.apply(reduction, engine="numba", axis=apply_axis) + expected = df.apply(reduction, engine="python", axis=apply_axis) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("colnames", [[1, 2, 3], [1.0, 2.0, 3.0]]) +def test_numba_numeric_colnames(colnames): + # Check that numeric column names lower properly and can be indxed on + df = DataFrame( + np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.int64), columns=colnames + ) + first_col = colnames[0] + f = lambda x: x[first_col] # Get the first column + result = df.apply(f, engine="numba", axis=1) + expected = df.apply(f, engine="python", axis=1) + tm.assert_series_equal(result, expected) + + +def test_numba_parallel_unsupported(float_frame): + f = lambda x: x + with pytest.raises( + NotImplementedError, + match="Parallel apply is not supported when raw=False and engine='numba'", + ): + float_frame.apply(f, engine="numba", engine_kwargs={"parallel": True}) + + +def test_numba_nonunique_unsupported(apply_axis): + f = lambda x: x + df = DataFrame({"a": [1, 2]}, index=Index(["a", "a"])) + with pytest.raises( + NotImplementedError, + match="The index/columns must be unique when raw=False and engine='numba'", + ): + df.apply(f, engine="numba", axis=apply_axis) + + +def test_numba_unsupported_dtypes(apply_axis): + f = lambda x: x + df = DataFrame({"a": [1, 2], "b": ["a", "b"], "c": [4, 5]}) + df["c"] = df["c"].astype("double[pyarrow]") + + with pytest.raises( + ValueError, + match="Column b must have a numeric dtype. Found 'object|string' instead", + ): + df.apply(f, engine="numba", axis=apply_axis) + + with pytest.raises( + ValueError, + match="Column c is backed by an extension array, " + "which is not supported by the numba engine.", + ): + df["c"].to_frame().apply(f, engine="numba", axis=apply_axis) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_series_apply.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_series_apply.py new file mode 100644 index 0000000000000000000000000000000000000000..df24fa08f48e1530cd3b3abca708a0d7743cd01d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_series_apply.py @@ -0,0 +1,701 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, + concat, + date_range, + timedelta_range, +) +import pandas._testing as tm +from pandas.tests.apply.common import series_transform_kernels + + +@pytest.fixture(params=[False, "compat"]) +def by_row(request): + return request.param + + +def test_series_map_box_timedelta(by_row): + # GH#11349 + ser = Series(timedelta_range("1 day 1 s", periods=3, freq="h")) + + def f(x): + return x.total_seconds() if by_row else x.dt.total_seconds() + + result = ser.apply(f, by_row=by_row) + + expected = ser.map(lambda x: x.total_seconds()) + tm.assert_series_equal(result, expected) + + expected = Series([86401.0, 90001.0, 93601.0]) + tm.assert_series_equal(result, expected) + + +def test_apply(datetime_series, by_row): + result = datetime_series.apply(np.sqrt, by_row=by_row) + with np.errstate(all="ignore"): + expected = np.sqrt(datetime_series) + tm.assert_series_equal(result, expected) + + # element-wise apply (ufunc) + result = datetime_series.apply(np.exp, by_row=by_row) + expected = np.exp(datetime_series) + tm.assert_series_equal(result, expected) + + # empty series + s = Series(dtype=object, name="foo", index=Index([], name="bar")) + rs = s.apply(lambda x: x, by_row=by_row) + tm.assert_series_equal(s, rs) + + # check all metadata (GH 9322) + assert s is not rs + assert s.index is rs.index + assert s.dtype == rs.dtype + assert s.name == rs.name + + # index but no data + s = Series(index=[1, 2, 3], dtype=np.float64) + rs = s.apply(lambda x: x, by_row=by_row) + tm.assert_series_equal(s, rs) + + +def test_apply_map_same_length_inference_bug(): + s = Series([1, 2]) + + def f(x): + return (x, x + 1) + + result = s.apply(f, by_row="compat") + expected = s.map(f) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("convert_dtype", [True, False]) +def test_apply_convert_dtype_deprecated(convert_dtype): + ser = Series(np.random.default_rng(2).standard_normal(10)) + + def func(x): + return x if x > 0 else np.nan + + with tm.assert_produces_warning(FutureWarning): + ser.apply(func, convert_dtype=convert_dtype, by_row="compat") + + +def test_apply_args(): + s = Series(["foo,bar"]) + + result = s.apply(str.split, args=(",",)) + assert result[0] == ["foo", "bar"] + assert isinstance(result[0], list) + + +@pytest.mark.parametrize( + "args, kwargs, increment", + [((), {}, 0), ((), {"a": 1}, 1), ((2, 3), {}, 32), ((1,), {"c": 2}, 201)], +) +def test_agg_args(args, kwargs, increment): + # GH 43357 + def f(x, a=0, b=0, c=0): + return x + a + 10 * b + 100 * c + + s = Series([1, 2]) + msg = ( + "in Series.agg cannot aggregate and has been deprecated. " + "Use Series.transform to keep behavior unchanged." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.agg(f, 0, *args, **kwargs) + expected = s + increment + tm.assert_series_equal(result, expected) + + +def test_agg_mapping_func_deprecated(): + # GH 53325 + s = Series([1, 2, 3]) + + def foo1(x, a=1, c=0): + return x + a + c + + def foo2(x, b=2, c=0): + return x + b + c + + msg = "using .+ in Series.agg cannot aggregate and" + with tm.assert_produces_warning(FutureWarning, match=msg): + s.agg(foo1, 0, 3, c=4) + with tm.assert_produces_warning(FutureWarning, match=msg): + s.agg([foo1, foo2], 0, 3, c=4) + with tm.assert_produces_warning(FutureWarning, match=msg): + s.agg({"a": foo1, "b": foo2}, 0, 3, c=4) + + +def test_series_apply_map_box_timestamps(by_row): + # GH#2689, GH#2627 + ser = Series(date_range("1/1/2000", periods=10)) + + def func(x): + return (x.hour, x.day, x.month) + + if not by_row: + msg = "Series' object has no attribute 'hour'" + with pytest.raises(AttributeError, match=msg): + ser.apply(func, by_row=by_row) + return + + result = ser.apply(func, by_row=by_row) + expected = ser.map(func) + tm.assert_series_equal(result, expected) + + +def test_apply_box_dt64(): + # ufunc will not be boxed. Same test cases as the test_map_box + vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")] + ser = Series(vals, dtype="M8[ns]") + assert ser.dtype == "datetime64[ns]" + # boxed value must be Timestamp instance + res = ser.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}", by_row="compat") + exp = Series(["Timestamp_1_None", "Timestamp_2_None"]) + tm.assert_series_equal(res, exp) + + +def test_apply_box_dt64tz(): + vals = [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + ] + ser = Series(vals, dtype="M8[ns, US/Eastern]") + assert ser.dtype == "datetime64[ns, US/Eastern]" + res = ser.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}", by_row="compat") + exp = Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"]) + tm.assert_series_equal(res, exp) + + +def test_apply_box_td64(): + # timedelta + vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")] + ser = Series(vals) + assert ser.dtype == "timedelta64[ns]" + res = ser.apply(lambda x: f"{type(x).__name__}_{x.days}", by_row="compat") + exp = Series(["Timedelta_1", "Timedelta_2"]) + tm.assert_series_equal(res, exp) + + +def test_apply_box_period(): + # period + vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] + ser = Series(vals) + assert ser.dtype == "Period[M]" + res = ser.apply(lambda x: f"{type(x).__name__}_{x.freqstr}", by_row="compat") + exp = Series(["Period_M", "Period_M"]) + tm.assert_series_equal(res, exp) + + +def test_apply_datetimetz(by_row): + values = date_range("2011-01-01", "2011-01-02", freq="h").tz_localize("Asia/Tokyo") + s = Series(values, name="XX") + + result = s.apply(lambda x: x + pd.offsets.Day(), by_row=by_row) + exp_values = date_range("2011-01-02", "2011-01-03", freq="h").tz_localize( + "Asia/Tokyo" + ) + exp = Series(exp_values, name="XX") + tm.assert_series_equal(result, exp) + + result = s.apply(lambda x: x.hour if by_row else x.dt.hour, by_row=by_row) + exp = Series(list(range(24)) + [0], name="XX", dtype="int64" if by_row else "int32") + tm.assert_series_equal(result, exp) + + # not vectorized + def f(x): + return str(x.tz) if by_row else str(x.dt.tz) + + result = s.apply(f, by_row=by_row) + if by_row: + exp = Series(["Asia/Tokyo"] * 25, name="XX") + tm.assert_series_equal(result, exp) + else: + assert result == "Asia/Tokyo" + + +def test_apply_categorical(by_row, using_infer_string): + values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) + ser = Series(values, name="XX", index=list("abcdefg")) + + if not by_row: + msg = "Series' object has no attribute 'lower" + with pytest.raises(AttributeError, match=msg): + ser.apply(lambda x: x.lower(), by_row=by_row) + assert ser.apply(lambda x: "A", by_row=by_row) == "A" + return + + result = ser.apply(lambda x: x.lower(), by_row=by_row) + + # should be categorical dtype when the number of categories are + # the same + values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True) + exp = Series(values, name="XX", index=list("abcdefg")) + tm.assert_series_equal(result, exp) + tm.assert_categorical_equal(result.values, exp.values) + + result = ser.apply(lambda x: "A") + exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) + tm.assert_series_equal(result, exp) + assert result.dtype == object if not using_infer_string else "string[pyarrow_numpy]" + + +@pytest.mark.parametrize("series", [["1-1", "1-1", np.nan], ["1-1", "1-2", np.nan]]) +def test_apply_categorical_with_nan_values(series, by_row): + # GH 20714 bug fixed in: GH 24275 + s = Series(series, dtype="category") + if not by_row: + msg = "'Series' object has no attribute 'split'" + with pytest.raises(AttributeError, match=msg): + s.apply(lambda x: x.split("-")[0], by_row=by_row) + return + + result = s.apply(lambda x: x.split("-")[0], by_row=by_row) + result = result.astype(object) + expected = Series(["1", "1", np.nan], dtype="category") + expected = expected.astype(object) + tm.assert_series_equal(result, expected) + + +def test_apply_empty_integer_series_with_datetime_index(by_row): + # GH 21245 + s = Series([], index=date_range(start="2018-01-01", periods=0), dtype=int) + result = s.apply(lambda x: x, by_row=by_row) + tm.assert_series_equal(result, s) + + +def test_apply_dataframe_iloc(): + uintDF = DataFrame(np.uint64([1, 2, 3, 4, 5]), columns=["Numbers"]) + indexDF = DataFrame([2, 3, 2, 1, 2], columns=["Indices"]) + + def retrieve(targetRow, targetDF): + val = targetDF["Numbers"].iloc[targetRow] + return val + + result = indexDF["Indices"].apply(retrieve, args=(uintDF,)) + expected = Series([3, 4, 3, 2, 3], name="Indices", dtype="uint64") + tm.assert_series_equal(result, expected) + + +def test_transform(string_series, by_row): + # transforming functions + + with np.errstate(all="ignore"): + f_sqrt = np.sqrt(string_series) + f_abs = np.abs(string_series) + + # ufunc + result = string_series.apply(np.sqrt, by_row=by_row) + expected = f_sqrt.copy() + tm.assert_series_equal(result, expected) + + # list-like + result = string_series.apply([np.sqrt], by_row=by_row) + expected = f_sqrt.to_frame().copy() + expected.columns = ["sqrt"] + tm.assert_frame_equal(result, expected) + + result = string_series.apply(["sqrt"], by_row=by_row) + tm.assert_frame_equal(result, expected) + + # multiple items in list + # these are in the order as if we are applying both functions per + # series and then concatting + expected = concat([f_sqrt, f_abs], axis=1) + expected.columns = ["sqrt", "absolute"] + result = string_series.apply([np.sqrt, np.abs], by_row=by_row) + tm.assert_frame_equal(result, expected) + + # dict, provide renaming + expected = concat([f_sqrt, f_abs], axis=1) + expected.columns = ["foo", "bar"] + expected = expected.unstack().rename("series") + + result = string_series.apply({"foo": np.sqrt, "bar": np.abs}, by_row=by_row) + tm.assert_series_equal(result.reindex_like(expected), expected) + + +@pytest.mark.parametrize("op", series_transform_kernels) +def test_transform_partial_failure(op, request): + # GH 35964 + if op in ("ffill", "bfill", "pad", "backfill", "shift"): + request.applymarker( + pytest.mark.xfail(reason=f"{op} is successful on any dtype") + ) + + # Using object makes most transform kernels fail + ser = Series(3 * [object]) + + if op in ("fillna", "ngroup"): + error = ValueError + msg = "Transform function failed" + else: + error = TypeError + msg = "|".join( + [ + "not supported between instances of 'type' and 'type'", + "unsupported operand type", + ] + ) + + with pytest.raises(error, match=msg): + ser.transform([op, "shift"]) + + with pytest.raises(error, match=msg): + ser.transform({"A": op, "B": "shift"}) + + with pytest.raises(error, match=msg): + ser.transform({"A": [op], "B": ["shift"]}) + + with pytest.raises(error, match=msg): + ser.transform({"A": [op, "shift"], "B": [op]}) + + +def test_transform_partial_failure_valueerror(): + # GH 40211 + def noop(x): + return x + + def raising_op(_): + raise ValueError + + ser = Series(3 * [object]) + msg = "Transform function failed" + + with pytest.raises(ValueError, match=msg): + ser.transform([noop, raising_op]) + + with pytest.raises(ValueError, match=msg): + ser.transform({"A": raising_op, "B": noop}) + + with pytest.raises(ValueError, match=msg): + ser.transform({"A": [raising_op], "B": [noop]}) + + with pytest.raises(ValueError, match=msg): + ser.transform({"A": [noop, raising_op], "B": [noop]}) + + +def test_demo(): + # demonstration tests + s = Series(range(6), dtype="int64", name="series") + + result = s.agg(["min", "max"]) + expected = Series([0, 5], index=["min", "max"], name="series") + tm.assert_series_equal(result, expected) + + result = s.agg({"foo": "min"}) + expected = Series([0], index=["foo"], name="series") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("func", [str, lambda x: str(x)]) +def test_apply_map_evaluate_lambdas_the_same(string_series, func, by_row): + # test that we are evaluating row-by-row first if by_row="compat" + # else vectorized evaluation + result = string_series.apply(func, by_row=by_row) + + if by_row: + expected = string_series.map(func) + tm.assert_series_equal(result, expected) + else: + assert result == str(string_series) + + +def test_agg_evaluate_lambdas(string_series): + # GH53325 + # in the future, the result will be a Series class. + + with tm.assert_produces_warning(FutureWarning): + result = string_series.agg(lambda x: type(x)) + assert isinstance(result, Series) and len(result) == len(string_series) + + with tm.assert_produces_warning(FutureWarning): + result = string_series.agg(type) + assert isinstance(result, Series) and len(result) == len(string_series) + + +@pytest.mark.parametrize("op_name", ["agg", "apply"]) +def test_with_nested_series(datetime_series, op_name): + # GH 2316 + # .agg with a reducer and a transform, what to do + msg = "cannot aggregate" + warning = FutureWarning if op_name == "agg" else None + with tm.assert_produces_warning(warning, match=msg): + # GH52123 + result = getattr(datetime_series, op_name)( + lambda x: Series([x, x**2], index=["x", "x^2"]) + ) + expected = DataFrame({"x": datetime_series, "x^2": datetime_series**2}) + tm.assert_frame_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = datetime_series.agg(lambda x: Series([x, x**2], index=["x", "x^2"])) + tm.assert_frame_equal(result, expected) + + +def test_replicate_describe(string_series): + # this also tests a result set that is all scalars + expected = string_series.describe() + result = string_series.apply( + { + "count": "count", + "mean": "mean", + "std": "std", + "min": "min", + "25%": lambda x: x.quantile(0.25), + "50%": "median", + "75%": lambda x: x.quantile(0.75), + "max": "max", + }, + ) + tm.assert_series_equal(result, expected) + + +def test_reduce(string_series): + # reductions with named functions + result = string_series.agg(["sum", "mean"]) + expected = Series( + [string_series.sum(), string_series.mean()], + ["sum", "mean"], + name=string_series.name, + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "how, kwds", + [("agg", {}), ("apply", {"by_row": "compat"}), ("apply", {"by_row": False})], +) +def test_non_callable_aggregates(how, kwds): + # test agg using non-callable series attributes + # GH 39116 - expand to apply + s = Series([1, 2, None]) + + # Calling agg w/ just a string arg same as calling s.arg + result = getattr(s, how)("size", **kwds) + expected = s.size + assert result == expected + + # test when mixed w/ callable reducers + result = getattr(s, how)(["size", "count", "mean"], **kwds) + expected = Series({"size": 3.0, "count": 2.0, "mean": 1.5}) + tm.assert_series_equal(result, expected) + + result = getattr(s, how)({"size": "size", "count": "count", "mean": "mean"}, **kwds) + tm.assert_series_equal(result, expected) + + +def test_series_apply_no_suffix_index(by_row): + # GH36189 + s = Series([4] * 3) + result = s.apply(["sum", lambda x: x.sum(), lambda x: x.sum()], by_row=by_row) + expected = Series([12, 12, 12], index=["sum", "", ""]) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "dti,exp", + [ + ( + Series([1, 2], index=pd.DatetimeIndex([0, 31536000000])), + DataFrame(np.repeat([[1, 2]], 2, axis=0), dtype="int64"), + ), + ( + Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ), + DataFrame(np.repeat([[1, 2]], 10, axis=0), dtype="int64"), + ), + ], +) +@pytest.mark.parametrize("aware", [True, False]) +def test_apply_series_on_date_time_index_aware_series(dti, exp, aware): + # GH 25959 + # Calling apply on a localized time series should not cause an error + if aware: + index = dti.tz_localize("UTC").index + else: + index = dti.index + result = Series(index).apply(lambda x: Series([1, 2])) + tm.assert_frame_equal(result, exp) + + +@pytest.mark.parametrize( + "by_row, expected", [("compat", Series(np.ones(10), dtype="int64")), (False, 1)] +) +def test_apply_scalar_on_date_time_index_aware_series(by_row, expected): + # GH 25959 + # Calling apply on a localized time series should not cause an error + series = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10, tz="UTC"), + ) + result = Series(series.index).apply(lambda x: 1, by_row=by_row) + tm.assert_equal(result, expected) + + +def test_apply_to_timedelta(by_row): + list_of_valid_strings = ["00:00:01", "00:00:02"] + a = pd.to_timedelta(list_of_valid_strings) + b = Series(list_of_valid_strings).apply(pd.to_timedelta, by_row=by_row) + tm.assert_series_equal(Series(a), b) + + list_of_strings = ["00:00:01", np.nan, pd.NaT, pd.NaT] + + a = pd.to_timedelta(list_of_strings) + ser = Series(list_of_strings) + b = ser.apply(pd.to_timedelta, by_row=by_row) + tm.assert_series_equal(Series(a), b) + + +@pytest.mark.parametrize( + "ops, names", + [ + ([np.sum], ["sum"]), + ([np.sum, np.mean], ["sum", "mean"]), + (np.array([np.sum]), ["sum"]), + (np.array([np.sum, np.mean]), ["sum", "mean"]), + ], +) +@pytest.mark.parametrize( + "how, kwargs", + [["agg", {}], ["apply", {"by_row": "compat"}], ["apply", {"by_row": False}]], +) +def test_apply_listlike_reducer(string_series, ops, names, how, kwargs): + # GH 39140 + expected = Series({name: op(string_series) for name, op in zip(names, ops)}) + expected.name = "series" + warn = FutureWarning if how == "agg" else None + msg = f"using Series.[{'|'.join(names)}]" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(string_series, how)(ops, **kwargs) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "ops", + [ + {"A": np.sum}, + {"A": np.sum, "B": np.mean}, + Series({"A": np.sum}), + Series({"A": np.sum, "B": np.mean}), + ], +) +@pytest.mark.parametrize( + "how, kwargs", + [["agg", {}], ["apply", {"by_row": "compat"}], ["apply", {"by_row": False}]], +) +def test_apply_dictlike_reducer(string_series, ops, how, kwargs, by_row): + # GH 39140 + expected = Series({name: op(string_series) for name, op in ops.items()}) + expected.name = string_series.name + warn = FutureWarning if how == "agg" else None + msg = "using Series.[sum|mean]" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(string_series, how)(ops, **kwargs) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "ops, names", + [ + ([np.sqrt], ["sqrt"]), + ([np.abs, np.sqrt], ["absolute", "sqrt"]), + (np.array([np.sqrt]), ["sqrt"]), + (np.array([np.abs, np.sqrt]), ["absolute", "sqrt"]), + ], +) +def test_apply_listlike_transformer(string_series, ops, names, by_row): + # GH 39140 + with np.errstate(all="ignore"): + expected = concat([op(string_series) for op in ops], axis=1) + expected.columns = names + result = string_series.apply(ops, by_row=by_row) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "ops, expected", + [ + ([lambda x: x], DataFrame({"": [1, 2, 3]})), + ([lambda x: x.sum()], Series([6], index=[""])), + ], +) +def test_apply_listlike_lambda(ops, expected, by_row): + # GH53400 + ser = Series([1, 2, 3]) + result = ser.apply(ops, by_row=by_row) + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "ops", + [ + {"A": np.sqrt}, + {"A": np.sqrt, "B": np.exp}, + Series({"A": np.sqrt}), + Series({"A": np.sqrt, "B": np.exp}), + ], +) +def test_apply_dictlike_transformer(string_series, ops, by_row): + # GH 39140 + with np.errstate(all="ignore"): + expected = concat({name: op(string_series) for name, op in ops.items()}) + expected.name = string_series.name + result = string_series.apply(ops, by_row=by_row) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "ops, expected", + [ + ( + {"a": lambda x: x}, + Series([1, 2, 3], index=MultiIndex.from_arrays([["a"] * 3, range(3)])), + ), + ({"a": lambda x: x.sum()}, Series([6], index=["a"])), + ], +) +def test_apply_dictlike_lambda(ops, by_row, expected): + # GH53400 + ser = Series([1, 2, 3]) + result = ser.apply(ops, by_row=by_row) + tm.assert_equal(result, expected) + + +def test_apply_retains_column_name(by_row): + # GH 16380 + df = DataFrame({"x": range(3)}, Index(range(3), name="x")) + result = df.x.apply(lambda x: Series(range(x + 1), Index(range(x + 1), name="y"))) + expected = DataFrame( + [[0.0, np.nan, np.nan], [0.0, 1.0, np.nan], [0.0, 1.0, 2.0]], + columns=Index(range(3), name="y"), + index=Index(range(3), name="x"), + ) + tm.assert_frame_equal(result, expected) + + +def test_apply_type(): + # GH 46719 + s = Series([3, "string", float], index=["a", "b", "c"]) + result = s.apply(type) + expected = Series([int, str, type], index=["a", "b", "c"]) + tm.assert_series_equal(result, expected) + + +def test_series_apply_unpack_nested_data(): + # GH#55189 + ser = Series([[1, 2, 3], [4, 5, 6, 7]]) + result = ser.apply(lambda x: Series(x)) + expected = DataFrame({0: [1.0, 4.0], 1: [2.0, 5.0], 2: [3.0, 6.0], 3: [np.nan, 7]}) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_series_apply_relabeling.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_series_apply_relabeling.py new file mode 100644 index 0000000000000000000000000000000000000000..cdfa054f91c9b67261d715cd7812a53d1b2d4b2f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_series_apply_relabeling.py @@ -0,0 +1,39 @@ +import pandas as pd +import pandas._testing as tm + + +def test_relabel_no_duplicated_method(): + # this is to test there is no duplicated method used in agg + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]}) + + result = df["A"].agg(foo="sum") + expected = df["A"].agg({"foo": "sum"}) + tm.assert_series_equal(result, expected) + + result = df["B"].agg(foo="min", bar="max") + expected = df["B"].agg({"foo": "min", "bar": "max"}) + tm.assert_series_equal(result, expected) + + msg = "using Series.[sum|min|max]" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df["B"].agg(foo=sum, bar=min, cat="max") + msg = "using Series.[sum|min|max]" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df["B"].agg({"foo": sum, "bar": min, "cat": "max"}) + tm.assert_series_equal(result, expected) + + +def test_relabel_duplicated_method(): + # this is to test with nested renaming, duplicated method can be used + # if they are assigned with different new names + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]}) + + result = df["A"].agg(foo="sum", bar="sum") + expected = pd.Series([6, 6], index=["foo", "bar"], name="A") + tm.assert_series_equal(result, expected) + + msg = "using Series.min" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df["B"].agg(foo=min, bar="min") + expected = pd.Series([1, 1], index=["foo", "bar"], name="B") + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_series_transform.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_series_transform.py new file mode 100644 index 0000000000000000000000000000000000000000..82592c4711ece5a7f4b6d421d743e1adbd78c345 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_series_transform.py @@ -0,0 +1,84 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + MultiIndex, + Series, + concat, +) +import pandas._testing as tm + + +@pytest.mark.parametrize( + "args, kwargs, increment", + [((), {}, 0), ((), {"a": 1}, 1), ((2, 3), {}, 32), ((1,), {"c": 2}, 201)], +) +def test_agg_args(args, kwargs, increment): + # GH 43357 + def f(x, a=0, b=0, c=0): + return x + a + 10 * b + 100 * c + + s = Series([1, 2]) + result = s.transform(f, 0, *args, **kwargs) + expected = s + increment + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "ops, names", + [ + ([np.sqrt], ["sqrt"]), + ([np.abs, np.sqrt], ["absolute", "sqrt"]), + (np.array([np.sqrt]), ["sqrt"]), + (np.array([np.abs, np.sqrt]), ["absolute", "sqrt"]), + ], +) +def test_transform_listlike(string_series, ops, names): + # GH 35964 + with np.errstate(all="ignore"): + expected = concat([op(string_series) for op in ops], axis=1) + expected.columns = names + result = string_series.transform(ops) + tm.assert_frame_equal(result, expected) + + +def test_transform_listlike_func_with_args(): + # GH 50624 + + s = Series([1, 2, 3]) + + def foo1(x, a=1, c=0): + return x + a + c + + def foo2(x, b=2, c=0): + return x + b + c + + msg = r"foo1\(\) got an unexpected keyword argument 'b'" + with pytest.raises(TypeError, match=msg): + s.transform([foo1, foo2], 0, 3, b=3, c=4) + + result = s.transform([foo1, foo2], 0, 3, c=4) + expected = DataFrame({"foo1": [8, 9, 10], "foo2": [8, 9, 10]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("box", [dict, Series]) +def test_transform_dictlike(string_series, box): + # GH 35964 + with np.errstate(all="ignore"): + expected = concat([np.sqrt(string_series), np.abs(string_series)], axis=1) + expected.columns = ["foo", "bar"] + result = string_series.transform(box({"foo": np.sqrt, "bar": np.abs})) + tm.assert_frame_equal(result, expected) + + +def test_transform_dictlike_mixed(): + # GH 40018 - mix of lists and non-lists in values of a dictionary + df = Series([1, 4]) + result = df.transform({"b": ["sqrt", "abs"], "c": "sqrt"}) + expected = DataFrame( + [[1.0, 1, 1.0], [2.0, 4, 2.0]], + columns=MultiIndex([("b", "c"), ("sqrt", "abs")], [(0, 0, 1), (0, 1, 0)]), + ) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_str.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_str.py new file mode 100644 index 0000000000000000000000000000000000000000..17e8322dc40e1ef0e65ed6d63a6e4af3a373e29b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/apply/test_str.py @@ -0,0 +1,326 @@ +from itertools import chain +import operator + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_number + +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm +from pandas.tests.apply.common import ( + frame_transform_kernels, + series_transform_kernels, +) + + +@pytest.mark.parametrize("func", ["sum", "mean", "min", "max", "std"]) +@pytest.mark.parametrize( + "args,kwds", + [ + pytest.param([], {}, id="no_args_or_kwds"), + pytest.param([1], {}, id="axis_from_args"), + pytest.param([], {"axis": 1}, id="axis_from_kwds"), + pytest.param([], {"numeric_only": True}, id="optional_kwds"), + pytest.param([1, True], {"numeric_only": True}, id="args_and_kwds"), + ], +) +@pytest.mark.parametrize("how", ["agg", "apply"]) +def test_apply_with_string_funcs(request, float_frame, func, args, kwds, how): + if len(args) > 1 and how == "agg": + request.applymarker( + pytest.mark.xfail( + raises=TypeError, + reason="agg/apply signature mismatch - agg passes 2nd " + "argument to func", + ) + ) + result = getattr(float_frame, how)(func, *args, **kwds) + expected = getattr(float_frame, func)(*args, **kwds) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("arg", ["sum", "mean", "min", "max", "std"]) +def test_with_string_args(datetime_series, arg): + result = datetime_series.apply(arg) + expected = getattr(datetime_series, arg)() + assert result == expected + + +@pytest.mark.parametrize("op", ["mean", "median", "std", "var"]) +@pytest.mark.parametrize("how", ["agg", "apply"]) +def test_apply_np_reducer(op, how): + # GH 39116 + float_frame = DataFrame({"a": [1, 2], "b": [3, 4]}) + result = getattr(float_frame, how)(op) + # pandas ddof defaults to 1, numpy to 0 + kwargs = {"ddof": 1} if op in ("std", "var") else {} + expected = Series( + getattr(np, op)(float_frame, axis=0, **kwargs), index=float_frame.columns + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "op", ["abs", "ceil", "cos", "cumsum", "exp", "log", "sqrt", "square"] +) +@pytest.mark.parametrize("how", ["transform", "apply"]) +def test_apply_np_transformer(float_frame, op, how): + # GH 39116 + + # float_frame will _usually_ have negative values, which will + # trigger the warning here, but let's put one in just to be sure + float_frame.iloc[0, 0] = -1.0 + warn = None + if op in ["log", "sqrt"]: + warn = RuntimeWarning + + with tm.assert_produces_warning(warn, check_stacklevel=False): + # float_frame fixture is defined in conftest.py, so we don't check the + # stacklevel as otherwise the test would fail. + result = getattr(float_frame, how)(op) + expected = getattr(np, op)(float_frame) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "series, func, expected", + chain( + tm.get_cython_table_params( + Series(dtype=np.float64), + [ + ("sum", 0), + ("max", np.nan), + ("min", np.nan), + ("all", True), + ("any", False), + ("mean", np.nan), + ("prod", 1), + ("std", np.nan), + ("var", np.nan), + ("median", np.nan), + ], + ), + tm.get_cython_table_params( + Series([np.nan, 1, 2, 3]), + [ + ("sum", 6), + ("max", 3), + ("min", 1), + ("all", True), + ("any", True), + ("mean", 2), + ("prod", 6), + ("std", 1), + ("var", 1), + ("median", 2), + ], + ), + tm.get_cython_table_params( + Series("a b c".split()), + [ + ("sum", "abc"), + ("max", "c"), + ("min", "a"), + ("all", True), + ("any", True), + ], + ), + ), +) +def test_agg_cython_table_series(series, func, expected): + # GH21224 + # test reducing functions in + # pandas.core.base.SelectionMixin._cython_table + warn = None if isinstance(func, str) else FutureWarning + with tm.assert_produces_warning(warn, match="is currently using Series.*"): + result = series.agg(func) + if is_number(expected): + assert np.isclose(result, expected, equal_nan=True) + else: + assert result == expected + + +@pytest.mark.parametrize( + "series, func, expected", + chain( + tm.get_cython_table_params( + Series(dtype=np.float64), + [ + ("cumprod", Series([], dtype=np.float64)), + ("cumsum", Series([], dtype=np.float64)), + ], + ), + tm.get_cython_table_params( + Series([np.nan, 1, 2, 3]), + [ + ("cumprod", Series([np.nan, 1, 2, 6])), + ("cumsum", Series([np.nan, 1, 3, 6])), + ], + ), + tm.get_cython_table_params( + Series("a b c".split()), [("cumsum", Series(["a", "ab", "abc"]))] + ), + ), +) +def test_agg_cython_table_transform_series(series, func, expected): + # GH21224 + # test transforming functions in + # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) + warn = None if isinstance(func, str) else FutureWarning + with tm.assert_produces_warning(warn, match="is currently using Series.*"): + result = series.agg(func) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "df, func, expected", + chain( + tm.get_cython_table_params( + DataFrame(), + [ + ("sum", Series(dtype="float64")), + ("max", Series(dtype="float64")), + ("min", Series(dtype="float64")), + ("all", Series(dtype=bool)), + ("any", Series(dtype=bool)), + ("mean", Series(dtype="float64")), + ("prod", Series(dtype="float64")), + ("std", Series(dtype="float64")), + ("var", Series(dtype="float64")), + ("median", Series(dtype="float64")), + ], + ), + tm.get_cython_table_params( + DataFrame([[np.nan, 1], [1, 2]]), + [ + ("sum", Series([1.0, 3])), + ("max", Series([1.0, 2])), + ("min", Series([1.0, 1])), + ("all", Series([True, True])), + ("any", Series([True, True])), + ("mean", Series([1, 1.5])), + ("prod", Series([1.0, 2])), + ("std", Series([np.nan, 0.707107])), + ("var", Series([np.nan, 0.5])), + ("median", Series([1, 1.5])), + ], + ), + ), +) +def test_agg_cython_table_frame(df, func, expected, axis): + # GH 21224 + # test reducing functions in + # pandas.core.base.SelectionMixin._cython_table + warn = None if isinstance(func, str) else FutureWarning + with tm.assert_produces_warning(warn, match="is currently using DataFrame.*"): + # GH#53425 + result = df.agg(func, axis=axis) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "df, func, expected", + chain( + tm.get_cython_table_params( + DataFrame(), [("cumprod", DataFrame()), ("cumsum", DataFrame())] + ), + tm.get_cython_table_params( + DataFrame([[np.nan, 1], [1, 2]]), + [ + ("cumprod", DataFrame([[np.nan, 1], [1, 2]])), + ("cumsum", DataFrame([[np.nan, 1], [1, 3]])), + ], + ), + ), +) +def test_agg_cython_table_transform_frame(df, func, expected, axis): + # GH 21224 + # test transforming functions in + # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) + if axis in ("columns", 1): + # operating blockwise doesn't let us preserve dtypes + expected = expected.astype("float64") + + warn = None if isinstance(func, str) else FutureWarning + with tm.assert_produces_warning(warn, match="is currently using DataFrame.*"): + # GH#53425 + result = df.agg(func, axis=axis) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("op", series_transform_kernels) +def test_transform_groupby_kernel_series(request, string_series, op): + # GH 35964 + if op == "ngroup": + request.applymarker( + pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame") + ) + args = [0.0] if op == "fillna" else [] + ones = np.ones(string_series.shape[0]) + + warn = FutureWarning if op == "fillna" else None + msg = "SeriesGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=msg): + expected = string_series.groupby(ones).transform(op, *args) + result = string_series.transform(op, 0, *args) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("op", frame_transform_kernels) +def test_transform_groupby_kernel_frame(request, axis, float_frame, op): + if op == "ngroup": + request.applymarker( + pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame") + ) + + # GH 35964 + + args = [0.0] if op == "fillna" else [] + if axis in (0, "index"): + ones = np.ones(float_frame.shape[0]) + msg = "The 'axis' keyword in DataFrame.groupby is deprecated" + else: + ones = np.ones(float_frame.shape[1]) + msg = "DataFrame.groupby with axis=1 is deprecated" + + with tm.assert_produces_warning(FutureWarning, match=msg): + gb = float_frame.groupby(ones, axis=axis) + + warn = FutureWarning if op == "fillna" else None + op_msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=op_msg): + expected = gb.transform(op, *args) + + result = float_frame.transform(op, axis, *args) + tm.assert_frame_equal(result, expected) + + # same thing, but ensuring we have multiple blocks + assert "E" not in float_frame.columns + float_frame["E"] = float_frame["A"].copy() + assert len(float_frame._mgr.arrays) > 1 + + if axis in (0, "index"): + ones = np.ones(float_frame.shape[0]) + else: + ones = np.ones(float_frame.shape[1]) + with tm.assert_produces_warning(FutureWarning, match=msg): + gb2 = float_frame.groupby(ones, axis=axis) + warn = FutureWarning if op == "fillna" else None + op_msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=op_msg): + expected2 = gb2.transform(op, *args) + result2 = float_frame.transform(op, axis, *args) + tm.assert_frame_equal(result2, expected2) + + +@pytest.mark.parametrize("method", ["abs", "shift", "pct_change", "cumsum", "rank"]) +def test_transform_method_name(method): + # GH 19760 + df = DataFrame({"A": [-1, 2]}) + result = df.transform(method) + expected = operator.methodcaller(method)(df) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1cb6559cddf01b4ebe1bd1c81204b30de595afaf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/common.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/common.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f0746ce46485fb4f1109dfffaadc725fcc64f40 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/common.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/conftest.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/conftest.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e80f23c6d197518f35804167107ba1f6d1a4fc4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/conftest.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/test_array_ops.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/test_array_ops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..79363d47e63ab2f5381dfff3d010f16a6a8bfdad Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/test_array_ops.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/test_categorical.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/test_categorical.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6238b5251b78502bb182baa83d0b03262c738b4d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/test_categorical.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/test_interval.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/test_interval.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76a3618c5cad27db77bc90704fddc75bb90703d3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/test_interval.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/test_numeric.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/test_numeric.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ef6fd04e1fc584553512602946ea01b8abb4ba83 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/test_numeric.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/test_object.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/test_object.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e87c870af8419a9f3c8da4de6dcf76b91808f46 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/test_object.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/test_period.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/test_period.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e3f9752db56df12bb42eda981b52853f0dccc5cd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/__pycache__/test_period.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/test_numeric.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/test_numeric.py new file mode 100644 index 0000000000000000000000000000000000000000..d8c1786b6b422c32a0f396b43f51d75b2b3ffe25 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/test_numeric.py @@ -0,0 +1,1567 @@ +# Arithmetic tests for DataFrame/Series/Index/Array classes that should +# behave identically. +# Specifically for numeric dtypes +from __future__ import annotations + +from collections import abc +from datetime import timedelta +from decimal import Decimal +import operator + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Index, + RangeIndex, + Series, + Timedelta, + TimedeltaIndex, + array, + date_range, +) +import pandas._testing as tm +from pandas.core import ops +from pandas.core.computation import expressions as expr +from pandas.tests.arithmetic.common import ( + assert_invalid_addsub_type, + assert_invalid_comparison, +) + + +@pytest.fixture(autouse=True, params=[0, 1000000], ids=["numexpr", "python"]) +def switch_numexpr_min_elements(request, monkeypatch): + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", request.param) + yield request.param + + +@pytest.fixture(params=[Index, Series, tm.to_array]) +def box_pandas_1d_array(request): + """ + Fixture to test behavior for Index, Series and tm.to_array classes + """ + return request.param + + +@pytest.fixture( + params=[ + # TODO: add more dtypes here + Index(np.arange(5, dtype="float64")), + Index(np.arange(5, dtype="int64")), + Index(np.arange(5, dtype="uint64")), + RangeIndex(5), + ], + ids=lambda x: type(x).__name__, +) +def numeric_idx(request): + """ + Several types of numeric-dtypes Index objects + """ + return request.param + + +@pytest.fixture( + params=[Index, Series, tm.to_array, np.array, list], ids=lambda x: x.__name__ +) +def box_1d_array(request): + """ + Fixture to test behavior for Index, Series, tm.to_array, numpy Array and list + classes + """ + return request.param + + +def adjust_negative_zero(zero, expected): + """ + Helper to adjust the expected result if we are dividing by -0.0 + as opposed to 0.0 + """ + if np.signbit(np.array(zero)).any(): + # All entries in the `zero` fixture should be either + # all-negative or no-negative. + assert np.signbit(np.array(zero)).all() + + expected *= -1 + + return expected + + +def compare_op(series, other, op): + left = np.abs(series) if op in (ops.rpow, operator.pow) else series + right = np.abs(other) if op in (ops.rpow, operator.pow) else other + + cython_or_numpy = op(left, right) + python = left.combine(right, op) + if isinstance(other, Series) and not other.index.equals(series.index): + python.index = python.index._with_freq(None) + tm.assert_series_equal(cython_or_numpy, python) + + +# TODO: remove this kludge once mypy stops giving false positives here +# List comprehension has incompatible type List[PandasObject]; expected List[RangeIndex] +# See GH#29725 +_ldtypes = ["i1", "i2", "i4", "i8", "u1", "u2", "u4", "u8", "f2", "f4", "f8"] +lefts: list[Index | Series] = [RangeIndex(10, 40, 10)] +lefts.extend([Series([10, 20, 30], dtype=dtype) for dtype in _ldtypes]) +lefts.extend([Index([10, 20, 30], dtype=dtype) for dtype in _ldtypes if dtype != "f2"]) + +# ------------------------------------------------------------------ +# Comparisons + + +class TestNumericComparisons: + def test_operator_series_comparison_zerorank(self): + # GH#13006 + result = np.float64(0) > Series([1, 2, 3]) + expected = 0.0 > Series([1, 2, 3]) + tm.assert_series_equal(result, expected) + result = Series([1, 2, 3]) < np.float64(0) + expected = Series([1, 2, 3]) < 0.0 + tm.assert_series_equal(result, expected) + result = np.array([0, 1, 2])[0] > Series([0, 1, 2]) + expected = 0.0 > Series([1, 2, 3]) + tm.assert_series_equal(result, expected) + + def test_df_numeric_cmp_dt64_raises(self, box_with_array, fixed_now_ts): + # GH#8932, GH#22163 + ts = fixed_now_ts + obj = np.array(range(5)) + obj = tm.box_expected(obj, box_with_array) + + assert_invalid_comparison(obj, ts, box_with_array) + + def test_compare_invalid(self): + # GH#8058 + # ops testing + a = Series(np.random.default_rng(2).standard_normal(5), name=0) + b = Series(np.random.default_rng(2).standard_normal(5)) + b.name = pd.Timestamp("2000-01-01") + tm.assert_series_equal(a / b, 1 / (b / a)) + + def test_numeric_cmp_string_numexpr_path(self, box_with_array, monkeypatch): + # GH#36377, GH#35700 + box = box_with_array + xbox = box if box is not Index else np.ndarray + + obj = Series(np.random.default_rng(2).standard_normal(51)) + obj = tm.box_expected(obj, box, transpose=False) + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", 50) + result = obj == "a" + + expected = Series(np.zeros(51, dtype=bool)) + expected = tm.box_expected(expected, xbox, transpose=False) + tm.assert_equal(result, expected) + + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", 50) + result = obj != "a" + tm.assert_equal(result, ~expected) + + msg = "Invalid comparison between dtype=float64 and str" + with pytest.raises(TypeError, match=msg): + obj < "a" + + +# ------------------------------------------------------------------ +# Numeric dtypes Arithmetic with Datetime/Timedelta Scalar + + +class TestNumericArraylikeArithmeticWithDatetimeLike: + @pytest.mark.parametrize("box_cls", [np.array, Index, Series]) + @pytest.mark.parametrize( + "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype) + ) + def test_mul_td64arr(self, left, box_cls): + # GH#22390 + right = np.array([1, 2, 3], dtype="m8[s]") + right = box_cls(right) + + expected = TimedeltaIndex(["10s", "40s", "90s"], dtype=right.dtype) + + if isinstance(left, Series) or box_cls is Series: + expected = Series(expected) + assert expected.dtype == right.dtype + + result = left * right + tm.assert_equal(result, expected) + + result = right * left + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("box_cls", [np.array, Index, Series]) + @pytest.mark.parametrize( + "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype) + ) + def test_div_td64arr(self, left, box_cls): + # GH#22390 + right = np.array([10, 40, 90], dtype="m8[s]") + right = box_cls(right) + + expected = TimedeltaIndex(["1s", "2s", "3s"], dtype=right.dtype) + if isinstance(left, Series) or box_cls is Series: + expected = Series(expected) + assert expected.dtype == right.dtype + + result = right / left + tm.assert_equal(result, expected) + + result = right // left + tm.assert_equal(result, expected) + + # (true_) needed for min-versions build 2022-12-26 + msg = "ufunc '(true_)?divide' cannot use operands with types" + with pytest.raises(TypeError, match=msg): + left / right + + msg = "ufunc 'floor_divide' cannot use operands with types" + with pytest.raises(TypeError, match=msg): + left // right + + # TODO: also test Tick objects; + # see test_numeric_arr_rdiv_tdscalar for note on these failing + @pytest.mark.parametrize( + "scalar_td", + [ + Timedelta(days=1), + Timedelta(days=1).to_timedelta64(), + Timedelta(days=1).to_pytimedelta(), + Timedelta(days=1).to_timedelta64().astype("timedelta64[s]"), + Timedelta(days=1).to_timedelta64().astype("timedelta64[ms]"), + ], + ids=lambda x: type(x).__name__, + ) + def test_numeric_arr_mul_tdscalar(self, scalar_td, numeric_idx, box_with_array): + # GH#19333 + box = box_with_array + index = numeric_idx + expected = TimedeltaIndex([Timedelta(days=n) for n in range(len(index))]) + if isinstance(scalar_td, np.timedelta64): + dtype = scalar_td.dtype + expected = expected.astype(dtype) + elif type(scalar_td) is timedelta: + expected = expected.astype("m8[us]") + + index = tm.box_expected(index, box) + expected = tm.box_expected(expected, box) + + result = index * scalar_td + tm.assert_equal(result, expected) + + commute = scalar_td * index + tm.assert_equal(commute, expected) + + @pytest.mark.parametrize( + "scalar_td", + [ + Timedelta(days=1), + Timedelta(days=1).to_timedelta64(), + Timedelta(days=1).to_pytimedelta(), + ], + ids=lambda x: type(x).__name__, + ) + @pytest.mark.parametrize("dtype", [np.int64, np.float64]) + def test_numeric_arr_mul_tdscalar_numexpr_path( + self, dtype, scalar_td, box_with_array + ): + # GH#44772 for the float64 case + box = box_with_array + + arr_i8 = np.arange(2 * 10**4).astype(np.int64, copy=False) + arr = arr_i8.astype(dtype, copy=False) + obj = tm.box_expected(arr, box, transpose=False) + + expected = arr_i8.view("timedelta64[D]").astype("timedelta64[ns]") + if type(scalar_td) is timedelta: + expected = expected.astype("timedelta64[us]") + + expected = tm.box_expected(expected, box, transpose=False) + + result = obj * scalar_td + tm.assert_equal(result, expected) + + result = scalar_td * obj + tm.assert_equal(result, expected) + + def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box_with_array): + box = box_with_array + + index = numeric_idx[1:3] + + expected = TimedeltaIndex(["3 Days", "36 Hours"]) + if isinstance(three_days, np.timedelta64): + dtype = three_days.dtype + if dtype < np.dtype("m8[s]"): + # i.e. resolution is lower -> use lowest supported resolution + dtype = np.dtype("m8[s]") + expected = expected.astype(dtype) + elif type(three_days) is timedelta: + expected = expected.astype("m8[us]") + elif isinstance( + three_days, + (pd.offsets.Day, pd.offsets.Hour, pd.offsets.Minute, pd.offsets.Second), + ): + # closest reso is Second + expected = expected.astype("m8[s]") + + index = tm.box_expected(index, box) + expected = tm.box_expected(expected, box) + + result = three_days / index + tm.assert_equal(result, expected) + + msg = "cannot use operands with types dtype" + with pytest.raises(TypeError, match=msg): + index / three_days + + @pytest.mark.parametrize( + "other", + [ + Timedelta(hours=31), + Timedelta(hours=31).to_pytimedelta(), + Timedelta(hours=31).to_timedelta64(), + Timedelta(hours=31).to_timedelta64().astype("m8[h]"), + np.timedelta64("NaT"), + np.timedelta64("NaT", "D"), + pd.offsets.Minute(3), + pd.offsets.Second(0), + # GH#28080 numeric+datetimelike should raise; Timestamp used + # to raise NullFrequencyError but that behavior was removed in 1.0 + pd.Timestamp("2021-01-01", tz="Asia/Tokyo"), + pd.Timestamp("2021-01-01"), + pd.Timestamp("2021-01-01").to_pydatetime(), + pd.Timestamp("2021-01-01", tz="UTC").to_pydatetime(), + pd.Timestamp("2021-01-01").to_datetime64(), + np.datetime64("NaT", "ns"), + pd.NaT, + ], + ids=repr, + ) + def test_add_sub_datetimedeltalike_invalid( + self, numeric_idx, other, box_with_array + ): + box = box_with_array + + left = tm.box_expected(numeric_idx, box) + msg = "|".join( + [ + "unsupported operand type", + "Addition/subtraction of integers and integer-arrays", + "Instead of adding/subtracting", + "cannot use operands with types dtype", + "Concatenation operation is not implemented for NumPy arrays", + "Cannot (add|subtract) NaT (to|from) ndarray", + # pd.array vs np.datetime64 case + r"operand type\(s\) all returned NotImplemented from __array_ufunc__", + "can only perform ops with numeric values", + "cannot subtract DatetimeArray from ndarray", + # pd.Timedelta(1) + Index([0, 1, 2]) + "Cannot add or subtract Timedelta from integers", + ] + ) + assert_invalid_addsub_type(left, other, msg) + + +# ------------------------------------------------------------------ +# Arithmetic + + +class TestDivisionByZero: + def test_div_zero(self, zero, numeric_idx): + idx = numeric_idx + + expected = Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64) + # We only adjust for Index, because Series does not yet apply + # the adjustment correctly. + expected2 = adjust_negative_zero(zero, expected) + + result = idx / zero + tm.assert_index_equal(result, expected2) + ser_compat = Series(idx).astype("i8") / np.array(zero).astype("i8") + tm.assert_series_equal(ser_compat, Series(expected)) + + def test_floordiv_zero(self, zero, numeric_idx): + idx = numeric_idx + + expected = Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64) + # We only adjust for Index, because Series does not yet apply + # the adjustment correctly. + expected2 = adjust_negative_zero(zero, expected) + + result = idx // zero + tm.assert_index_equal(result, expected2) + ser_compat = Series(idx).astype("i8") // np.array(zero).astype("i8") + tm.assert_series_equal(ser_compat, Series(expected)) + + def test_mod_zero(self, zero, numeric_idx): + idx = numeric_idx + + expected = Index([np.nan, np.nan, np.nan, np.nan, np.nan], dtype=np.float64) + result = idx % zero + tm.assert_index_equal(result, expected) + ser_compat = Series(idx).astype("i8") % np.array(zero).astype("i8") + tm.assert_series_equal(ser_compat, Series(result)) + + def test_divmod_zero(self, zero, numeric_idx): + idx = numeric_idx + + exleft = Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64) + exright = Index([np.nan, np.nan, np.nan, np.nan, np.nan], dtype=np.float64) + exleft = adjust_negative_zero(zero, exleft) + + result = divmod(idx, zero) + tm.assert_index_equal(result[0], exleft) + tm.assert_index_equal(result[1], exright) + + @pytest.mark.parametrize("op", [operator.truediv, operator.floordiv]) + def test_div_negative_zero(self, zero, numeric_idx, op): + # Check that -1 / -0.0 returns np.inf, not -np.inf + if numeric_idx.dtype == np.uint64: + pytest.skip(f"Div by negative 0 not relevant for {numeric_idx.dtype}") + idx = numeric_idx - 3 + + expected = Index([-np.inf, -np.inf, -np.inf, np.nan, np.inf], dtype=np.float64) + expected = adjust_negative_zero(zero, expected) + + result = op(idx, zero) + tm.assert_index_equal(result, expected) + + # ------------------------------------------------------------------ + + @pytest.mark.parametrize("dtype1", [np.int64, np.float64, np.uint64]) + def test_ser_div_ser( + self, + switch_numexpr_min_elements, + dtype1, + any_real_numpy_dtype, + ): + # no longer do integer div for any ops, but deal with the 0's + dtype2 = any_real_numpy_dtype + + first = Series([3, 4, 5, 8], name="first").astype(dtype1) + second = Series([0, 0, 0, 3], name="second").astype(dtype2) + + with np.errstate(all="ignore"): + expected = Series( + first.values.astype(np.float64) / second.values, + dtype="float64", + name=None, + ) + expected.iloc[0:3] = np.inf + if first.dtype == "int64" and second.dtype == "float32": + # when using numexpr, the casting rules are slightly different + # and int64/float32 combo results in float32 instead of float64 + if expr.USE_NUMEXPR and switch_numexpr_min_elements == 0: + expected = expected.astype("float32") + + result = first / second + tm.assert_series_equal(result, expected) + assert not result.equals(second / first) + + @pytest.mark.parametrize("dtype1", [np.int64, np.float64, np.uint64]) + def test_ser_divmod_zero(self, dtype1, any_real_numpy_dtype): + # GH#26987 + dtype2 = any_real_numpy_dtype + left = Series([1, 1]).astype(dtype1) + right = Series([0, 2]).astype(dtype2) + + # GH#27321 pandas convention is to set 1 // 0 to np.inf, as opposed + # to numpy which sets to np.nan; patch `expected[0]` below + expected = left // right, left % right + expected = list(expected) + expected[0] = expected[0].astype(np.float64) + expected[0][0] = np.inf + result = divmod(left, right) + + tm.assert_series_equal(result[0], expected[0]) + tm.assert_series_equal(result[1], expected[1]) + + # rdivmod case + result = divmod(left.values, right) + tm.assert_series_equal(result[0], expected[0]) + tm.assert_series_equal(result[1], expected[1]) + + def test_ser_divmod_inf(self): + left = Series([np.inf, 1.0]) + right = Series([np.inf, 2.0]) + + expected = left // right, left % right + result = divmod(left, right) + + tm.assert_series_equal(result[0], expected[0]) + tm.assert_series_equal(result[1], expected[1]) + + # rdivmod case + result = divmod(left.values, right) + tm.assert_series_equal(result[0], expected[0]) + tm.assert_series_equal(result[1], expected[1]) + + def test_rdiv_zero_compat(self): + # GH#8674 + zero_array = np.array([0] * 5) + data = np.random.default_rng(2).standard_normal(5) + expected = Series([0.0] * 5) + + result = zero_array / Series(data) + tm.assert_series_equal(result, expected) + + result = Series(zero_array) / data + tm.assert_series_equal(result, expected) + + result = Series(zero_array) / Series(data) + tm.assert_series_equal(result, expected) + + def test_div_zero_inf_signs(self): + # GH#9144, inf signing + ser = Series([-1, 0, 1], name="first") + expected = Series([-np.inf, np.nan, np.inf], name="first") + + result = ser / 0 + tm.assert_series_equal(result, expected) + + def test_rdiv_zero(self): + # GH#9144 + ser = Series([-1, 0, 1], name="first") + expected = Series([0.0, np.nan, 0.0], name="first") + + result = 0 / ser + tm.assert_series_equal(result, expected) + + def test_floordiv_div(self): + # GH#9144 + ser = Series([-1, 0, 1], name="first") + + result = ser // 0 + expected = Series([-np.inf, np.nan, np.inf], name="first") + tm.assert_series_equal(result, expected) + + def test_df_div_zero_df(self): + # integer div, but deal with the 0's (GH#9144) + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) + result = df / df + + first = Series([1.0, 1.0, 1.0, 1.0]) + second = Series([np.nan, np.nan, np.nan, 1]) + expected = pd.DataFrame({"first": first, "second": second}) + tm.assert_frame_equal(result, expected) + + def test_df_div_zero_array(self): + # integer div, but deal with the 0's (GH#9144) + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) + + first = Series([1.0, 1.0, 1.0, 1.0]) + second = Series([np.nan, np.nan, np.nan, 1]) + expected = pd.DataFrame({"first": first, "second": second}) + + with np.errstate(all="ignore"): + arr = df.values.astype("float") / df.values + result = pd.DataFrame(arr, index=df.index, columns=df.columns) + tm.assert_frame_equal(result, expected) + + def test_df_div_zero_int(self): + # integer div, but deal with the 0's (GH#9144) + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) + + result = df / 0 + expected = pd.DataFrame(np.inf, index=df.index, columns=df.columns) + expected.iloc[0:3, 1] = np.nan + tm.assert_frame_equal(result, expected) + + # numpy has a slightly different (wrong) treatment + with np.errstate(all="ignore"): + arr = df.values.astype("float64") / 0 + result2 = pd.DataFrame(arr, index=df.index, columns=df.columns) + tm.assert_frame_equal(result2, expected) + + def test_df_div_zero_series_does_not_commute(self): + # integer div, but deal with the 0's (GH#9144) + df = pd.DataFrame(np.random.default_rng(2).standard_normal((10, 5))) + ser = df[0] + res = ser / df + res2 = df / ser + assert not res.fillna(0).equals(res2.fillna(0)) + + # ------------------------------------------------------------------ + # Mod By Zero + + def test_df_mod_zero_df(self, using_array_manager): + # GH#3590, modulo as ints + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) + # this is technically wrong, as the integer portion is coerced to float + first = Series([0, 0, 0, 0]) + if not using_array_manager: + # INFO(ArrayManager) BlockManager doesn't preserve dtype per column + # while ArrayManager performs op column-wisedoes and thus preserves + # dtype if possible + first = first.astype("float64") + second = Series([np.nan, np.nan, np.nan, 0]) + expected = pd.DataFrame({"first": first, "second": second}) + result = df % df + tm.assert_frame_equal(result, expected) + + # GH#38939 If we dont pass copy=False, df is consolidated and + # result["first"] is float64 instead of int64 + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}, copy=False) + first = Series([0, 0, 0, 0], dtype="int64") + second = Series([np.nan, np.nan, np.nan, 0]) + expected = pd.DataFrame({"first": first, "second": second}) + result = df % df + tm.assert_frame_equal(result, expected) + + def test_df_mod_zero_array(self): + # GH#3590, modulo as ints + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) + + # this is technically wrong, as the integer portion is coerced to float + # ### + first = Series([0, 0, 0, 0], dtype="float64") + second = Series([np.nan, np.nan, np.nan, 0]) + expected = pd.DataFrame({"first": first, "second": second}) + + # numpy has a slightly different (wrong) treatment + with np.errstate(all="ignore"): + arr = df.values % df.values + result2 = pd.DataFrame(arr, index=df.index, columns=df.columns, dtype="float64") + result2.iloc[0:3, 1] = np.nan + tm.assert_frame_equal(result2, expected) + + def test_df_mod_zero_int(self): + # GH#3590, modulo as ints + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) + + result = df % 0 + expected = pd.DataFrame(np.nan, index=df.index, columns=df.columns) + tm.assert_frame_equal(result, expected) + + # numpy has a slightly different (wrong) treatment + with np.errstate(all="ignore"): + arr = df.values.astype("float64") % 0 + result2 = pd.DataFrame(arr, index=df.index, columns=df.columns) + tm.assert_frame_equal(result2, expected) + + def test_df_mod_zero_series_does_not_commute(self): + # GH#3590, modulo as ints + # not commutative with series + df = pd.DataFrame(np.random.default_rng(2).standard_normal((10, 5))) + ser = df[0] + res = ser % df + res2 = df % ser + assert not res.fillna(0).equals(res2.fillna(0)) + + +class TestMultiplicationDivision: + # __mul__, __rmul__, __div__, __rdiv__, __floordiv__, __rfloordiv__ + # for non-timestamp/timedelta/period dtypes + + def test_divide_decimal(self, box_with_array): + # resolves issue GH#9787 + box = box_with_array + ser = Series([Decimal(10)]) + expected = Series([Decimal(5)]) + + ser = tm.box_expected(ser, box) + expected = tm.box_expected(expected, box) + + result = ser / Decimal(2) + + tm.assert_equal(result, expected) + + result = ser // Decimal(2) + tm.assert_equal(result, expected) + + def test_div_equiv_binop(self): + # Test Series.div as well as Series.__div__ + # float/integer issue + # GH#7785 + first = Series([1, 0], name="first") + second = Series([-0.01, -0.02], name="second") + expected = Series([-0.01, -np.inf]) + + result = second.div(first) + tm.assert_series_equal(result, expected, check_names=False) + + result = second / first + tm.assert_series_equal(result, expected) + + def test_div_int(self, numeric_idx): + idx = numeric_idx + result = idx / 1 + expected = idx.astype("float64") + tm.assert_index_equal(result, expected) + + result = idx / 2 + expected = Index(idx.values / 2) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("op", [operator.mul, ops.rmul, operator.floordiv]) + def test_mul_int_identity(self, op, numeric_idx, box_with_array): + idx = numeric_idx + idx = tm.box_expected(idx, box_with_array) + + result = op(idx, 1) + tm.assert_equal(result, idx) + + def test_mul_int_array(self, numeric_idx): + idx = numeric_idx + didx = idx * idx + + result = idx * np.array(5, dtype="int64") + tm.assert_index_equal(result, idx * 5) + + arr_dtype = "uint64" if idx.dtype == np.uint64 else "int64" + result = idx * np.arange(5, dtype=arr_dtype) + tm.assert_index_equal(result, didx) + + def test_mul_int_series(self, numeric_idx): + idx = numeric_idx + didx = idx * idx + + arr_dtype = "uint64" if idx.dtype == np.uint64 else "int64" + result = idx * Series(np.arange(5, dtype=arr_dtype)) + tm.assert_series_equal(result, Series(didx)) + + def test_mul_float_series(self, numeric_idx): + idx = numeric_idx + rng5 = np.arange(5, dtype="float64") + + result = idx * Series(rng5 + 0.1) + expected = Series(rng5 * (rng5 + 0.1)) + tm.assert_series_equal(result, expected) + + def test_mul_index(self, numeric_idx): + idx = numeric_idx + + result = idx * idx + tm.assert_index_equal(result, idx**2) + + def test_mul_datelike_raises(self, numeric_idx): + idx = numeric_idx + msg = "cannot perform __rmul__ with this index type" + with pytest.raises(TypeError, match=msg): + idx * date_range("20130101", periods=5) + + def test_mul_size_mismatch_raises(self, numeric_idx): + idx = numeric_idx + msg = "operands could not be broadcast together" + with pytest.raises(ValueError, match=msg): + idx * idx[0:3] + with pytest.raises(ValueError, match=msg): + idx * np.array([1, 2]) + + @pytest.mark.parametrize("op", [operator.pow, ops.rpow]) + def test_pow_float(self, op, numeric_idx, box_with_array): + # test power calculations both ways, GH#14973 + box = box_with_array + idx = numeric_idx + expected = Index(op(idx.values, 2.0)) + + idx = tm.box_expected(idx, box) + expected = tm.box_expected(expected, box) + + result = op(idx, 2.0) + tm.assert_equal(result, expected) + + def test_modulo(self, numeric_idx, box_with_array): + # GH#9244 + box = box_with_array + idx = numeric_idx + expected = Index(idx.values % 2) + + idx = tm.box_expected(idx, box) + expected = tm.box_expected(expected, box) + + result = idx % 2 + tm.assert_equal(result, expected) + + def test_divmod_scalar(self, numeric_idx): + idx = numeric_idx + + result = divmod(idx, 2) + with np.errstate(all="ignore"): + div, mod = divmod(idx.values, 2) + + expected = Index(div), Index(mod) + for r, e in zip(result, expected): + tm.assert_index_equal(r, e) + + def test_divmod_ndarray(self, numeric_idx): + idx = numeric_idx + other = np.ones(idx.values.shape, dtype=idx.values.dtype) * 2 + + result = divmod(idx, other) + with np.errstate(all="ignore"): + div, mod = divmod(idx.values, other) + + expected = Index(div), Index(mod) + for r, e in zip(result, expected): + tm.assert_index_equal(r, e) + + def test_divmod_series(self, numeric_idx): + idx = numeric_idx + other = np.ones(idx.values.shape, dtype=idx.values.dtype) * 2 + + result = divmod(idx, Series(other)) + with np.errstate(all="ignore"): + div, mod = divmod(idx.values, other) + + expected = Series(div), Series(mod) + for r, e in zip(result, expected): + tm.assert_series_equal(r, e) + + @pytest.mark.parametrize("other", [np.nan, 7, -23, 2.718, -3.14, np.inf]) + def test_ops_np_scalar(self, other): + vals = np.random.default_rng(2).standard_normal((5, 3)) + f = lambda x: pd.DataFrame( + x, index=list("ABCDE"), columns=["jim", "joe", "jolie"] + ) + + df = f(vals) + + tm.assert_frame_equal(df / np.array(other), f(vals / other)) + tm.assert_frame_equal(np.array(other) * df, f(vals * other)) + tm.assert_frame_equal(df + np.array(other), f(vals + other)) + tm.assert_frame_equal(np.array(other) - df, f(other - vals)) + + # TODO: This came from series.test.test_operators, needs cleanup + def test_operators_frame(self): + # rpow does not work with DataFrame + ts = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) + ts.name = "ts" + + df = pd.DataFrame({"A": ts}) + + tm.assert_series_equal(ts + ts, ts + df["A"], check_names=False) + tm.assert_series_equal(ts**ts, ts ** df["A"], check_names=False) + tm.assert_series_equal(ts < ts, ts < df["A"], check_names=False) + tm.assert_series_equal(ts / ts, ts / df["A"], check_names=False) + + # TODO: this came from tests.series.test_analytics, needs cleanup and + # de-duplication with test_modulo above + def test_modulo2(self): + with np.errstate(all="ignore"): + # GH#3590, modulo as ints + p = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) + result = p["first"] % p["second"] + expected = Series(p["first"].values % p["second"].values, dtype="float64") + expected.iloc[0:3] = np.nan + tm.assert_series_equal(result, expected) + + result = p["first"] % 0 + expected = Series(np.nan, index=p.index, name="first") + tm.assert_series_equal(result, expected) + + p = p.astype("float64") + result = p["first"] % p["second"] + expected = Series(p["first"].values % p["second"].values) + tm.assert_series_equal(result, expected) + + p = p.astype("float64") + result = p["first"] % p["second"] + result2 = p["second"] % p["first"] + assert not result.equals(result2) + + def test_modulo_zero_int(self): + # GH#9144 + with np.errstate(all="ignore"): + s = Series([0, 1]) + + result = s % 0 + expected = Series([np.nan, np.nan]) + tm.assert_series_equal(result, expected) + + result = 0 % s + expected = Series([np.nan, 0.0]) + tm.assert_series_equal(result, expected) + + +class TestAdditionSubtraction: + # __add__, __sub__, __radd__, __rsub__, __iadd__, __isub__ + # for non-timestamp/timedelta/period dtypes + + @pytest.mark.parametrize( + "first, second, expected", + [ + ( + Series([1, 2, 3], index=list("ABC"), name="x"), + Series([2, 2, 2], index=list("ABD"), name="x"), + Series([3.0, 4.0, np.nan, np.nan], index=list("ABCD"), name="x"), + ), + ( + Series([1, 2, 3], index=list("ABC"), name="x"), + Series([2, 2, 2, 2], index=list("ABCD"), name="x"), + Series([3, 4, 5, np.nan], index=list("ABCD"), name="x"), + ), + ], + ) + def test_add_series(self, first, second, expected): + # GH#1134 + tm.assert_series_equal(first + second, expected) + tm.assert_series_equal(second + first, expected) + + @pytest.mark.parametrize( + "first, second, expected", + [ + ( + pd.DataFrame({"x": [1, 2, 3]}, index=list("ABC")), + pd.DataFrame({"x": [2, 2, 2]}, index=list("ABD")), + pd.DataFrame({"x": [3.0, 4.0, np.nan, np.nan]}, index=list("ABCD")), + ), + ( + pd.DataFrame({"x": [1, 2, 3]}, index=list("ABC")), + pd.DataFrame({"x": [2, 2, 2, 2]}, index=list("ABCD")), + pd.DataFrame({"x": [3, 4, 5, np.nan]}, index=list("ABCD")), + ), + ], + ) + def test_add_frames(self, first, second, expected): + # GH#1134 + tm.assert_frame_equal(first + second, expected) + tm.assert_frame_equal(second + first, expected) + + # TODO: This came from series.test.test_operators, needs cleanup + def test_series_frame_radd_bug(self, fixed_now_ts): + # GH#353 + vals = Series([str(i) for i in range(5)]) + result = "foo_" + vals + expected = vals.map(lambda x: "foo_" + x) + tm.assert_series_equal(result, expected) + + frame = pd.DataFrame({"vals": vals}) + result = "foo_" + frame + expected = pd.DataFrame({"vals": vals.map(lambda x: "foo_" + x)}) + tm.assert_frame_equal(result, expected) + + ts = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) + + # really raise this time + fix_now = fixed_now_ts.to_pydatetime() + msg = "|".join( + [ + "unsupported operand type", + # wrong error message, see https://github.com/numpy/numpy/issues/18832 + "Concatenation operation", + ] + ) + with pytest.raises(TypeError, match=msg): + fix_now + ts + + with pytest.raises(TypeError, match=msg): + ts + fix_now + + # TODO: This came from series.test.test_operators, needs cleanup + def test_datetime64_with_index(self): + # arithmetic integer ops with an index + ser = Series(np.random.default_rng(2).standard_normal(5)) + expected = ser - ser.index.to_series() + result = ser - ser.index + tm.assert_series_equal(result, expected) + + # GH#4629 + # arithmetic datetime64 ops with an index + ser = Series( + date_range("20130101", periods=5), + index=date_range("20130101", periods=5), + ) + expected = ser - ser.index.to_series() + result = ser - ser.index + tm.assert_series_equal(result, expected) + + msg = "cannot subtract PeriodArray from DatetimeArray" + with pytest.raises(TypeError, match=msg): + # GH#18850 + result = ser - ser.index.to_period() + + df = pd.DataFrame( + np.random.default_rng(2).standard_normal((5, 2)), + index=date_range("20130101", periods=5), + ) + df["date"] = pd.Timestamp("20130102") + df["expected"] = df["date"] - df.index.to_series() + df["result"] = df["date"] - df.index + tm.assert_series_equal(df["result"], df["expected"], check_names=False) + + # TODO: taken from tests.frame.test_operators, needs cleanup + def test_frame_operators(self, float_frame): + frame = float_frame + + garbage = np.random.default_rng(2).random(4) + colSeries = Series(garbage, index=np.array(frame.columns)) + + idSum = frame + frame + seriesSum = frame + colSeries + + for col, series in idSum.items(): + for idx, val in series.items(): + origVal = frame[col][idx] * 2 + if not np.isnan(val): + assert val == origVal + else: + assert np.isnan(origVal) + + for col, series in seriesSum.items(): + for idx, val in series.items(): + origVal = frame[col][idx] + colSeries[col] + if not np.isnan(val): + assert val == origVal + else: + assert np.isnan(origVal) + + def test_frame_operators_col_align(self, float_frame): + frame2 = pd.DataFrame(float_frame, columns=["D", "C", "B", "A"]) + added = frame2 + frame2 + expected = frame2 * 2 + tm.assert_frame_equal(added, expected) + + def test_frame_operators_none_to_nan(self): + df = pd.DataFrame({"a": ["a", None, "b"]}) + tm.assert_frame_equal(df + df, pd.DataFrame({"a": ["aa", np.nan, "bb"]})) + + @pytest.mark.parametrize("dtype", ("float", "int64")) + def test_frame_operators_empty_like(self, dtype): + # Test for issue #10181 + frames = [ + pd.DataFrame(dtype=dtype), + pd.DataFrame(columns=["A"], dtype=dtype), + pd.DataFrame(index=[0], dtype=dtype), + ] + for df in frames: + assert (df + df).equals(df) + tm.assert_frame_equal(df + df, df) + + @pytest.mark.parametrize( + "func", + [lambda x: x * 2, lambda x: x[::2], lambda x: 5], + ids=["multiply", "slice", "constant"], + ) + def test_series_operators_arithmetic(self, all_arithmetic_functions, func): + op = all_arithmetic_functions + series = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) + other = func(series) + compare_op(series, other, op) + + @pytest.mark.parametrize( + "func", [lambda x: x + 1, lambda x: 5], ids=["add", "constant"] + ) + def test_series_operators_compare(self, comparison_op, func): + op = comparison_op + series = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) + other = func(series) + compare_op(series, other, op) + + @pytest.mark.parametrize( + "func", + [lambda x: x * 2, lambda x: x[::2], lambda x: 5], + ids=["multiply", "slice", "constant"], + ) + def test_divmod(self, func): + series = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) + other = func(series) + results = divmod(series, other) + if isinstance(other, abc.Iterable) and len(series) != len(other): + # if the lengths don't match, this is the test where we use + # `tser[::2]`. Pad every other value in `other_np` with nan. + other_np = [] + for n in other: + other_np.append(n) + other_np.append(np.nan) + else: + other_np = other + other_np = np.asarray(other_np) + with np.errstate(all="ignore"): + expecteds = divmod(series.values, np.asarray(other_np)) + + for result, expected in zip(results, expecteds): + # check the values, name, and index separately + tm.assert_almost_equal(np.asarray(result), expected) + + assert result.name == series.name + tm.assert_index_equal(result.index, series.index._with_freq(None)) + + def test_series_divmod_zero(self): + # Check that divmod uses pandas convention for division by zero, + # which does not match numpy. + # pandas convention has + # 1/0 == np.inf + # -1/0 == -np.inf + # 1/-0.0 == -np.inf + # -1/-0.0 == np.inf + tser = Series( + np.arange(1, 11, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) + other = tser * 0 + + result = divmod(tser, other) + exp1 = Series([np.inf] * len(tser), index=tser.index, name="ts") + exp2 = Series([np.nan] * len(tser), index=tser.index, name="ts") + tm.assert_series_equal(result[0], exp1) + tm.assert_series_equal(result[1], exp2) + + +class TestUFuncCompat: + # TODO: add more dtypes + @pytest.mark.parametrize("holder", [Index, RangeIndex, Series]) + @pytest.mark.parametrize("dtype", [np.int64, np.uint64, np.float64]) + def test_ufunc_compat(self, holder, dtype): + box = Series if holder is Series else Index + + if holder is RangeIndex: + if dtype != np.int64: + pytest.skip(f"dtype {dtype} not relevant for RangeIndex") + idx = RangeIndex(0, 5, name="foo") + else: + idx = holder(np.arange(5, dtype=dtype), name="foo") + result = np.sin(idx) + expected = box(np.sin(np.arange(5, dtype=dtype)), name="foo") + tm.assert_equal(result, expected) + + # TODO: add more dtypes + @pytest.mark.parametrize("holder", [Index, Series]) + @pytest.mark.parametrize("dtype", [np.int64, np.uint64, np.float64]) + def test_ufunc_coercions(self, holder, dtype): + idx = holder([1, 2, 3, 4, 5], dtype=dtype, name="x") + box = Series if holder is Series else Index + + result = np.sqrt(idx) + assert result.dtype == "f8" and isinstance(result, box) + exp = Index(np.sqrt(np.array([1, 2, 3, 4, 5], dtype=np.float64)), name="x") + exp = tm.box_expected(exp, box) + tm.assert_equal(result, exp) + + result = np.divide(idx, 2.0) + assert result.dtype == "f8" and isinstance(result, box) + exp = Index([0.5, 1.0, 1.5, 2.0, 2.5], dtype=np.float64, name="x") + exp = tm.box_expected(exp, box) + tm.assert_equal(result, exp) + + # _evaluate_numeric_binop + result = idx + 2.0 + assert result.dtype == "f8" and isinstance(result, box) + exp = Index([3.0, 4.0, 5.0, 6.0, 7.0], dtype=np.float64, name="x") + exp = tm.box_expected(exp, box) + tm.assert_equal(result, exp) + + result = idx - 2.0 + assert result.dtype == "f8" and isinstance(result, box) + exp = Index([-1.0, 0.0, 1.0, 2.0, 3.0], dtype=np.float64, name="x") + exp = tm.box_expected(exp, box) + tm.assert_equal(result, exp) + + result = idx * 1.0 + assert result.dtype == "f8" and isinstance(result, box) + exp = Index([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float64, name="x") + exp = tm.box_expected(exp, box) + tm.assert_equal(result, exp) + + result = idx / 2.0 + assert result.dtype == "f8" and isinstance(result, box) + exp = Index([0.5, 1.0, 1.5, 2.0, 2.5], dtype=np.float64, name="x") + exp = tm.box_expected(exp, box) + tm.assert_equal(result, exp) + + # TODO: add more dtypes + @pytest.mark.parametrize("holder", [Index, Series]) + @pytest.mark.parametrize("dtype", [np.int64, np.uint64, np.float64]) + def test_ufunc_multiple_return_values(self, holder, dtype): + obj = holder([1, 2, 3], dtype=dtype, name="x") + box = Series if holder is Series else Index + + result = np.modf(obj) + assert isinstance(result, tuple) + exp1 = Index([0.0, 0.0, 0.0], dtype=np.float64, name="x") + exp2 = Index([1.0, 2.0, 3.0], dtype=np.float64, name="x") + tm.assert_equal(result[0], tm.box_expected(exp1, box)) + tm.assert_equal(result[1], tm.box_expected(exp2, box)) + + def test_ufunc_at(self): + s = Series([0, 1, 2], index=[1, 2, 3], name="x") + np.add.at(s, [0, 2], 10) + expected = Series([10, 1, 12], index=[1, 2, 3], name="x") + tm.assert_series_equal(s, expected) + + +class TestObjectDtypeEquivalence: + # Tests that arithmetic operations match operations executed elementwise + + @pytest.mark.parametrize("dtype", [None, object]) + def test_numarr_with_dtype_add_nan(self, dtype, box_with_array): + box = box_with_array + ser = Series([1, 2, 3], dtype=dtype) + expected = Series([np.nan, np.nan, np.nan], dtype=dtype) + + ser = tm.box_expected(ser, box) + expected = tm.box_expected(expected, box) + + result = np.nan + ser + tm.assert_equal(result, expected) + + result = ser + np.nan + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("dtype", [None, object]) + def test_numarr_with_dtype_add_int(self, dtype, box_with_array): + box = box_with_array + ser = Series([1, 2, 3], dtype=dtype) + expected = Series([2, 3, 4], dtype=dtype) + + ser = tm.box_expected(ser, box) + expected = tm.box_expected(expected, box) + + result = 1 + ser + tm.assert_equal(result, expected) + + result = ser + 1 + tm.assert_equal(result, expected) + + # TODO: moved from tests.series.test_operators; needs cleanup + @pytest.mark.parametrize( + "op", + [operator.add, operator.sub, operator.mul, operator.truediv, operator.floordiv], + ) + def test_operators_reverse_object(self, op): + # GH#56 + arr = Series( + np.random.default_rng(2).standard_normal(10), + index=np.arange(10), + dtype=object, + ) + + result = op(1.0, arr) + expected = op(1.0, arr.astype(float)) + tm.assert_series_equal(result.astype(float), expected) + + +class TestNumericArithmeticUnsorted: + # Tests in this class have been moved from type-specific test modules + # but not yet sorted, parametrized, and de-duplicated + @pytest.mark.parametrize( + "op", + [ + operator.add, + operator.sub, + operator.mul, + operator.floordiv, + operator.truediv, + ], + ) + @pytest.mark.parametrize( + "idx1", + [ + RangeIndex(0, 10, 1), + RangeIndex(0, 20, 2), + RangeIndex(-10, 10, 2), + RangeIndex(5, -5, -1), + ], + ) + @pytest.mark.parametrize( + "idx2", + [ + RangeIndex(0, 10, 1), + RangeIndex(0, 20, 2), + RangeIndex(-10, 10, 2), + RangeIndex(5, -5, -1), + ], + ) + def test_binops_index(self, op, idx1, idx2): + idx1 = idx1._rename("foo") + idx2 = idx2._rename("bar") + result = op(idx1, idx2) + expected = op(Index(idx1.to_numpy()), Index(idx2.to_numpy())) + tm.assert_index_equal(result, expected, exact="equiv") + + @pytest.mark.parametrize( + "op", + [ + operator.add, + operator.sub, + operator.mul, + operator.floordiv, + operator.truediv, + ], + ) + @pytest.mark.parametrize( + "idx", + [ + RangeIndex(0, 10, 1), + RangeIndex(0, 20, 2), + RangeIndex(-10, 10, 2), + RangeIndex(5, -5, -1), + ], + ) + @pytest.mark.parametrize("scalar", [-1, 1, 2]) + def test_binops_index_scalar(self, op, idx, scalar): + result = op(idx, scalar) + expected = op(Index(idx.to_numpy()), scalar) + tm.assert_index_equal(result, expected, exact="equiv") + + @pytest.mark.parametrize("idx1", [RangeIndex(0, 10, 1), RangeIndex(0, 20, 2)]) + @pytest.mark.parametrize("idx2", [RangeIndex(0, 10, 1), RangeIndex(0, 20, 2)]) + def test_binops_index_pow(self, idx1, idx2): + # numpy does not allow powers of negative integers so test separately + # https://github.com/numpy/numpy/pull/8127 + idx1 = idx1._rename("foo") + idx2 = idx2._rename("bar") + result = pow(idx1, idx2) + expected = pow(Index(idx1.to_numpy()), Index(idx2.to_numpy())) + tm.assert_index_equal(result, expected, exact="equiv") + + @pytest.mark.parametrize("idx", [RangeIndex(0, 10, 1), RangeIndex(0, 20, 2)]) + @pytest.mark.parametrize("scalar", [1, 2]) + def test_binops_index_scalar_pow(self, idx, scalar): + # numpy does not allow powers of negative integers so test separately + # https://github.com/numpy/numpy/pull/8127 + result = pow(idx, scalar) + expected = pow(Index(idx.to_numpy()), scalar) + tm.assert_index_equal(result, expected, exact="equiv") + + # TODO: divmod? + @pytest.mark.parametrize( + "op", + [ + operator.add, + operator.sub, + operator.mul, + operator.floordiv, + operator.truediv, + operator.pow, + operator.mod, + ], + ) + def test_arithmetic_with_frame_or_series(self, op): + # check that we return NotImplemented when operating with Series + # or DataFrame + index = RangeIndex(5) + other = Series(np.random.default_rng(2).standard_normal(5)) + + expected = op(Series(index), other) + result = op(index, other) + tm.assert_series_equal(result, expected) + + other = pd.DataFrame(np.random.default_rng(2).standard_normal((2, 5))) + expected = op(pd.DataFrame([index, index]), other) + result = op(index, other) + tm.assert_frame_equal(result, expected) + + def test_numeric_compat2(self): + # validate that we are handling the RangeIndex overrides to numeric ops + # and returning RangeIndex where possible + + idx = RangeIndex(0, 10, 2) + + result = idx * 2 + expected = RangeIndex(0, 20, 4) + tm.assert_index_equal(result, expected, exact=True) + + result = idx + 2 + expected = RangeIndex(2, 12, 2) + tm.assert_index_equal(result, expected, exact=True) + + result = idx - 2 + expected = RangeIndex(-2, 8, 2) + tm.assert_index_equal(result, expected, exact=True) + + result = idx / 2 + expected = RangeIndex(0, 5, 1).astype("float64") + tm.assert_index_equal(result, expected, exact=True) + + result = idx / 4 + expected = RangeIndex(0, 10, 2) / 4 + tm.assert_index_equal(result, expected, exact=True) + + result = idx // 1 + expected = idx + tm.assert_index_equal(result, expected, exact=True) + + # __mul__ + result = idx * idx + expected = Index(idx.values * idx.values) + tm.assert_index_equal(result, expected, exact=True) + + # __pow__ + idx = RangeIndex(0, 1000, 2) + result = idx**2 + expected = Index(idx._values) ** 2 + tm.assert_index_equal(Index(result.values), expected, exact=True) + + @pytest.mark.parametrize( + "idx, div, expected", + [ + # TODO: add more dtypes + (RangeIndex(0, 1000, 2), 2, RangeIndex(0, 500, 1)), + (RangeIndex(-99, -201, -3), -3, RangeIndex(33, 67, 1)), + ( + RangeIndex(0, 1000, 1), + 2, + Index(RangeIndex(0, 1000, 1)._values) // 2, + ), + ( + RangeIndex(0, 100, 1), + 2.0, + Index(RangeIndex(0, 100, 1)._values) // 2.0, + ), + (RangeIndex(0), 50, RangeIndex(0)), + (RangeIndex(2, 4, 2), 3, RangeIndex(0, 1, 1)), + (RangeIndex(-5, -10, -6), 4, RangeIndex(-2, -1, 1)), + (RangeIndex(-100, -200, 3), 2, RangeIndex(0)), + ], + ) + def test_numeric_compat2_floordiv(self, idx, div, expected): + # __floordiv__ + tm.assert_index_equal(idx // div, expected, exact=True) + + @pytest.mark.parametrize("dtype", [np.int64, np.float64]) + @pytest.mark.parametrize("delta", [1, 0, -1]) + def test_addsub_arithmetic(self, dtype, delta): + # GH#8142 + delta = dtype(delta) + index = Index([10, 11, 12], dtype=dtype) + result = index + delta + expected = Index(index.values + delta, dtype=dtype) + tm.assert_index_equal(result, expected) + + # this subtraction used to fail + result = index - delta + expected = Index(index.values - delta, dtype=dtype) + tm.assert_index_equal(result, expected) + + tm.assert_index_equal(index + index, 2 * index) + tm.assert_index_equal(index - index, 0 * index) + assert not (index - index).empty + + def test_pow_nan_with_zero(self, box_with_array): + left = Index([np.nan, np.nan, np.nan]) + right = Index([0, 0, 0]) + expected = Index([1.0, 1.0, 1.0]) + + left = tm.box_expected(left, box_with_array) + right = tm.box_expected(right, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = left**right + tm.assert_equal(result, expected) + + +def test_fill_value_inf_masking(): + # GH #27464 make sure we mask 0/1 with Inf and not NaN + df = pd.DataFrame({"A": [0, 1, 2], "B": [1.1, None, 1.1]}) + + other = pd.DataFrame({"A": [1.1, 1.2, 1.3]}, index=[0, 2, 3]) + + result = df.rfloordiv(other, fill_value=1) + + expected = pd.DataFrame( + {"A": [np.inf, 1.0, 0.0, 1.0], "B": [0.0, np.nan, 0.0, np.nan]} + ) + tm.assert_frame_equal(result, expected) + + +def test_dataframe_div_silenced(): + # GH#26793 + pdf1 = pd.DataFrame( + { + "A": np.arange(10), + "B": [np.nan, 1, 2, 3, 4] * 2, + "C": [np.nan] * 10, + "D": np.arange(10), + }, + index=list("abcdefghij"), + columns=list("ABCD"), + ) + pdf2 = pd.DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + index=list("abcdefghjk"), + columns=list("ABCX"), + ) + with tm.assert_produces_warning(None): + pdf1.div(pdf2, fill_value=0) + + +@pytest.mark.parametrize( + "data, expected_data", + [([0, 1, 2], [0, 2, 4])], +) +def test_integer_array_add_list_like( + box_pandas_1d_array, box_1d_array, data, expected_data +): + # GH22606 Verify operators with IntegerArray and list-likes + arr = array(data, dtype="Int64") + container = box_pandas_1d_array(arr) + left = container + box_1d_array(data) + right = box_1d_array(data) + container + + if Series in [box_1d_array, box_pandas_1d_array]: + cls = Series + elif Index in [box_1d_array, box_pandas_1d_array]: + cls = Index + else: + cls = array + + expected = cls(expected_data, dtype="Int64") + + tm.assert_equal(left, expected) + tm.assert_equal(right, expected) + + +def test_sub_multiindex_swapped_levels(): + # GH 9952 + df = pd.DataFrame( + {"a": np.random.default_rng(2).standard_normal(6)}, + index=pd.MultiIndex.from_product( + [["a", "b"], [0, 1, 2]], names=["levA", "levB"] + ), + ) + df2 = df.copy() + df2.index = df2.index.swaplevel(0, 1) + result = df - df2 + expected = pd.DataFrame([0.0] * 6, columns=["a"], index=df.index) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("power", [1, 2, 5]) +@pytest.mark.parametrize("string_size", [0, 1, 2, 5]) +def test_empty_str_comparison(power, string_size): + # GH 37348 + a = np.array(range(10**power)) + right = pd.DataFrame(a, dtype=np.int64) + left = " " * string_size + + result = right == left + expected = pd.DataFrame(np.zeros(right.shape, dtype=bool)) + tm.assert_frame_equal(result, expected) + + +def test_series_add_sub_with_UInt64(): + # GH 22023 + series1 = Series([1, 2, 3]) + series2 = Series([2, 1, 3], dtype="UInt64") + + result = series1 + series2 + expected = Series([3, 3, 6], dtype="Float64") + tm.assert_series_equal(result, expected) + + result = series1 - series2 + expected = Series([-1, 1, 0], dtype="Float64") + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/test_object.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/test_object.py new file mode 100644 index 0000000000000000000000000000000000000000..4ffd76722286ab0e6729334216652f4613d9769f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/test_object.py @@ -0,0 +1,420 @@ +# Arithmetic tests for DataFrame/Series/Index/Array classes that should +# behave identically. +# Specifically for object dtype +import datetime +from decimal import Decimal +import operator + +import numpy as np +import pytest + +from pandas._config import using_pyarrow_string_dtype + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + Series, + Timestamp, + option_context, +) +import pandas._testing as tm +from pandas.core import ops + +# ------------------------------------------------------------------ +# Comparisons + + +class TestObjectComparisons: + def test_comparison_object_numeric_nas(self, comparison_op): + ser = Series(np.random.default_rng(2).standard_normal(10), dtype=object) + shifted = ser.shift(2) + + func = comparison_op + + result = func(ser, shifted) + expected = func(ser.astype(float), shifted.astype(float)) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + def test_object_comparisons(self, infer_string): + with option_context("future.infer_string", infer_string): + ser = Series(["a", "b", np.nan, "c", "a"]) + + result = ser == "a" + expected = Series([True, False, False, False, True]) + tm.assert_series_equal(result, expected) + + result = ser < "a" + expected = Series([False, False, False, False, False]) + tm.assert_series_equal(result, expected) + + result = ser != "a" + expected = -(ser == "a") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("dtype", [None, object]) + def test_more_na_comparisons(self, dtype): + left = Series(["a", np.nan, "c"], dtype=dtype) + right = Series(["a", np.nan, "d"], dtype=dtype) + + result = left == right + expected = Series([True, False, False]) + tm.assert_series_equal(result, expected) + + result = left != right + expected = Series([False, True, True]) + tm.assert_series_equal(result, expected) + + result = left == np.nan + expected = Series([False, False, False]) + tm.assert_series_equal(result, expected) + + result = left != np.nan + expected = Series([True, True, True]) + tm.assert_series_equal(result, expected) + + +# ------------------------------------------------------------------ +# Arithmetic + + +class TestArithmetic: + def test_add_period_to_array_of_offset(self): + # GH#50162 + per = pd.Period("2012-1-1", freq="D") + pi = pd.period_range("2012-1-1", periods=10, freq="D") + idx = per - pi + + expected = pd.Index([x + per for x in idx], dtype=object) + result = idx + per + tm.assert_index_equal(result, expected) + + result = per + idx + tm.assert_index_equal(result, expected) + + # TODO: parametrize + def test_pow_ops_object(self): + # GH#22922 + # pow is weird with masking & 1, so testing here + a = Series([1, np.nan, 1, np.nan], dtype=object) + b = Series([1, np.nan, np.nan, 1], dtype=object) + result = a**b + expected = Series(a.values**b.values, dtype=object) + tm.assert_series_equal(result, expected) + + result = b**a + expected = Series(b.values**a.values, dtype=object) + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("op", [operator.add, ops.radd]) + @pytest.mark.parametrize("other", ["category", "Int64"]) + def test_add_extension_scalar(self, other, box_with_array, op): + # GH#22378 + # Check that scalars satisfying is_extension_array_dtype(obj) + # do not incorrectly try to dispatch to an ExtensionArray operation + + arr = Series(["a", "b", "c"]) + expected = Series([op(x, other) for x in arr]) + + arr = tm.box_expected(arr, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = op(arr, other) + tm.assert_equal(result, expected) + + def test_objarr_add_str(self, box_with_array): + ser = Series(["x", np.nan, "x"]) + expected = Series(["xa", np.nan, "xa"]) + + ser = tm.box_expected(ser, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = ser + "a" + tm.assert_equal(result, expected) + + def test_objarr_radd_str(self, box_with_array): + ser = Series(["x", np.nan, "x"]) + expected = Series(["ax", np.nan, "ax"]) + + ser = tm.box_expected(ser, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = "a" + ser + tm.assert_equal(result, expected) + + @pytest.mark.parametrize( + "data", + [ + [1, 2, 3], + [1.1, 2.2, 3.3], + [Timestamp("2011-01-01"), Timestamp("2011-01-02"), pd.NaT], + ["x", "y", 1], + ], + ) + @pytest.mark.parametrize("dtype", [None, object]) + def test_objarr_radd_str_invalid(self, dtype, data, box_with_array): + ser = Series(data, dtype=dtype) + + ser = tm.box_expected(ser, box_with_array) + msg = "|".join( + [ + "can only concatenate str", + "did not contain a loop with signature matching types", + "unsupported operand type", + "must be str", + ] + ) + with pytest.raises(TypeError, match=msg): + "foo_" + ser + + @pytest.mark.parametrize("op", [operator.add, ops.radd, operator.sub, ops.rsub]) + def test_objarr_add_invalid(self, op, box_with_array): + # invalid ops + box = box_with_array + + obj_ser = Series(list("abc"), dtype=object, name="objects") + + obj_ser = tm.box_expected(obj_ser, box) + msg = "|".join( + [ + "can only concatenate str", + "unsupported operand type", + "must be str", + "has no kernel", + ] + ) + with pytest.raises(Exception, match=msg): + op(obj_ser, 1) + with pytest.raises(Exception, match=msg): + op(obj_ser, np.array(1, dtype=np.int64)) + + # TODO: Moved from tests.series.test_operators; needs cleanup + def test_operators_na_handling(self): + ser = Series(["foo", "bar", "baz", np.nan]) + result = "prefix_" + ser + expected = Series(["prefix_foo", "prefix_bar", "prefix_baz", np.nan]) + tm.assert_series_equal(result, expected) + + result = ser + "_suffix" + expected = Series(["foo_suffix", "bar_suffix", "baz_suffix", np.nan]) + tm.assert_series_equal(result, expected) + + # TODO: parametrize over box + @pytest.mark.parametrize("dtype", [None, object]) + def test_series_with_dtype_radd_timedelta(self, dtype): + # note this test is _not_ aimed at timedelta64-dtyped Series + # as of 2.0 we retain object dtype when ser.dtype == object + ser = Series( + [pd.Timedelta("1 days"), pd.Timedelta("2 days"), pd.Timedelta("3 days")], + dtype=dtype, + ) + expected = Series( + [pd.Timedelta("4 days"), pd.Timedelta("5 days"), pd.Timedelta("6 days")], + dtype=dtype, + ) + + result = pd.Timedelta("3 days") + ser + tm.assert_series_equal(result, expected) + + result = ser + pd.Timedelta("3 days") + tm.assert_series_equal(result, expected) + + # TODO: cleanup & parametrize over box + def test_mixed_timezone_series_ops_object(self): + # GH#13043 + ser = Series( + [ + Timestamp("2015-01-01", tz="US/Eastern"), + Timestamp("2015-01-01", tz="Asia/Tokyo"), + ], + name="xxx", + ) + assert ser.dtype == object + + exp = Series( + [ + Timestamp("2015-01-02", tz="US/Eastern"), + Timestamp("2015-01-02", tz="Asia/Tokyo"), + ], + name="xxx", + ) + tm.assert_series_equal(ser + pd.Timedelta("1 days"), exp) + tm.assert_series_equal(pd.Timedelta("1 days") + ser, exp) + + # object series & object series + ser2 = Series( + [ + Timestamp("2015-01-03", tz="US/Eastern"), + Timestamp("2015-01-05", tz="Asia/Tokyo"), + ], + name="xxx", + ) + assert ser2.dtype == object + exp = Series( + [pd.Timedelta("2 days"), pd.Timedelta("4 days")], name="xxx", dtype=object + ) + tm.assert_series_equal(ser2 - ser, exp) + tm.assert_series_equal(ser - ser2, -exp) + + ser = Series( + [pd.Timedelta("01:00:00"), pd.Timedelta("02:00:00")], + name="xxx", + dtype=object, + ) + assert ser.dtype == object + + exp = Series( + [pd.Timedelta("01:30:00"), pd.Timedelta("02:30:00")], + name="xxx", + dtype=object, + ) + tm.assert_series_equal(ser + pd.Timedelta("00:30:00"), exp) + tm.assert_series_equal(pd.Timedelta("00:30:00") + ser, exp) + + # TODO: cleanup & parametrize over box + def test_iadd_preserves_name(self): + # GH#17067, GH#19723 __iadd__ and __isub__ should preserve index name + ser = Series([1, 2, 3]) + ser.index.name = "foo" + + ser.index += 1 + assert ser.index.name == "foo" + + ser.index -= 1 + assert ser.index.name == "foo" + + def test_add_string(self): + # from bug report + index = pd.Index(["a", "b", "c"]) + index2 = index + "foo" + + assert "a" not in index2 + assert "afoo" in index2 + + def test_iadd_string(self): + index = pd.Index(["a", "b", "c"]) + # doesn't fail test unless there is a check before `+=` + assert "a" in index + + index += "_x" + assert "a_x" in index + + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="add doesn't work") + def test_add(self): + index = pd.Index([str(i) for i in range(10)]) + expected = pd.Index(index.values * 2) + tm.assert_index_equal(index + index, expected) + tm.assert_index_equal(index + index.tolist(), expected) + tm.assert_index_equal(index.tolist() + index, expected) + + # test add and radd + index = pd.Index(list("abc")) + expected = pd.Index(["a1", "b1", "c1"]) + tm.assert_index_equal(index + "1", expected) + expected = pd.Index(["1a", "1b", "1c"]) + tm.assert_index_equal("1" + index, expected) + + def test_sub_fail(self, using_infer_string): + index = pd.Index([str(i) for i in range(10)]) + + if using_infer_string: + import pyarrow as pa + + err = pa.lib.ArrowNotImplementedError + msg = "has no kernel" + else: + err = TypeError + msg = "unsupported operand type|Cannot broadcast" + with pytest.raises(err, match=msg): + index - "a" + with pytest.raises(err, match=msg): + index - index + with pytest.raises(err, match=msg): + index - index.tolist() + with pytest.raises(err, match=msg): + index.tolist() - index + + def test_sub_object(self): + # GH#19369 + index = pd.Index([Decimal(1), Decimal(2)]) + expected = pd.Index([Decimal(0), Decimal(1)]) + + result = index - Decimal(1) + tm.assert_index_equal(result, expected) + + result = index - pd.Index([Decimal(1), Decimal(1)]) + tm.assert_index_equal(result, expected) + + msg = "unsupported operand type" + with pytest.raises(TypeError, match=msg): + index - "foo" + + with pytest.raises(TypeError, match=msg): + index - np.array([2, "foo"], dtype=object) + + def test_rsub_object(self, fixed_now_ts): + # GH#19369 + index = pd.Index([Decimal(1), Decimal(2)]) + expected = pd.Index([Decimal(1), Decimal(0)]) + + result = Decimal(2) - index + tm.assert_index_equal(result, expected) + + result = np.array([Decimal(2), Decimal(2)]) - index + tm.assert_index_equal(result, expected) + + msg = "unsupported operand type" + with pytest.raises(TypeError, match=msg): + "foo" - index + + with pytest.raises(TypeError, match=msg): + np.array([True, fixed_now_ts]) - index + + +class MyIndex(pd.Index): + # Simple index subclass that tracks ops calls. + + _calls: int + + @classmethod + def _simple_new(cls, values, name=None, dtype=None): + result = object.__new__(cls) + result._data = values + result._name = name + result._calls = 0 + result._reset_identity() + + return result + + def __add__(self, other): + self._calls += 1 + return self._simple_new(self._data) + + def __radd__(self, other): + return self.__add__(other) + + +@pytest.mark.parametrize( + "other", + [ + [datetime.timedelta(1), datetime.timedelta(2)], + [datetime.datetime(2000, 1, 1), datetime.datetime(2000, 1, 2)], + [pd.Period("2000"), pd.Period("2001")], + ["a", "b"], + ], + ids=["timedelta", "datetime", "period", "object"], +) +def test_index_ops_defer_to_unknown_subclasses(other): + # https://github.com/pandas-dev/pandas/issues/31109 + values = np.array( + [datetime.date(2000, 1, 1), datetime.date(2000, 1, 2)], dtype=object + ) + a = MyIndex._simple_new(values) + other = pd.Index(other) + result = other + a + assert isinstance(result, MyIndex) + assert a._calls == 1 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/test_period.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/test_period.py new file mode 100644 index 0000000000000000000000000000000000000000..5535fe8ff928d10b994bd6556229e0163a358ab0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/test_period.py @@ -0,0 +1,1675 @@ +# Arithmetic tests for DataFrame/Series/Index/Array classes that should +# behave identically. +# Specifically for Period dtype +import operator + +import numpy as np +import pytest + +from pandas._libs.tslibs import ( + IncompatibleFrequency, + Period, + Timestamp, + to_offset, +) +from pandas.errors import PerformanceWarning + +import pandas as pd +from pandas import ( + PeriodIndex, + Series, + Timedelta, + TimedeltaIndex, + period_range, +) +import pandas._testing as tm +from pandas.core import ops +from pandas.core.arrays import TimedeltaArray +from pandas.tests.arithmetic.common import ( + assert_invalid_addsub_type, + assert_invalid_comparison, + get_upcast_box, +) + +_common_mismatch = [ + pd.offsets.YearBegin(2), + pd.offsets.MonthBegin(1), + pd.offsets.Minute(), +] + + +@pytest.fixture( + params=[ + Timedelta(minutes=30).to_pytimedelta(), + np.timedelta64(30, "s"), + Timedelta(seconds=30), + ] + + _common_mismatch +) +def not_hourly(request): + """ + Several timedelta-like and DateOffset instances that are _not_ + compatible with Hourly frequencies. + """ + return request.param + + +@pytest.fixture( + params=[ + np.timedelta64(365, "D"), + Timedelta(days=365).to_pytimedelta(), + Timedelta(days=365), + ] + + _common_mismatch +) +def mismatched_freq(request): + """ + Several timedelta-like and DateOffset instances that are _not_ + compatible with Monthly or Annual frequencies. + """ + return request.param + + +# ------------------------------------------------------------------ +# Comparisons + + +class TestPeriodArrayLikeComparisons: + # Comparison tests for PeriodDtype vectors fully parametrized over + # DataFrame/Series/PeriodIndex/PeriodArray. Ideally all comparison + # tests will eventually end up here. + + @pytest.mark.parametrize("other", ["2017", Period("2017", freq="D")]) + def test_eq_scalar(self, other, box_with_array): + idx = PeriodIndex(["2017", "2017", "2018"], freq="D") + idx = tm.box_expected(idx, box_with_array) + xbox = get_upcast_box(idx, other, True) + + expected = np.array([True, True, False]) + expected = tm.box_expected(expected, xbox) + + result = idx == other + + tm.assert_equal(result, expected) + + def test_compare_zerodim(self, box_with_array): + # GH#26689 make sure we unbox zero-dimensional arrays + + pi = period_range("2000", periods=4) + other = np.array(pi.to_numpy()[0]) + + pi = tm.box_expected(pi, box_with_array) + xbox = get_upcast_box(pi, other, True) + + result = pi <= other + expected = np.array([True, False, False, False]) + expected = tm.box_expected(expected, xbox) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize( + "scalar", + [ + "foo", + Timestamp("2021-01-01"), + Timedelta(days=4), + 9, + 9.5, + 2000, # specifically don't consider 2000 to match Period("2000", "D") + False, + None, + ], + ) + def test_compare_invalid_scalar(self, box_with_array, scalar): + # GH#28980 + # comparison with scalar that cannot be interpreted as a Period + pi = period_range("2000", periods=4) + parr = tm.box_expected(pi, box_with_array) + assert_invalid_comparison(parr, scalar, box_with_array) + + @pytest.mark.parametrize( + "other", + [ + pd.date_range("2000", periods=4).array, + pd.timedelta_range("1D", periods=4).array, + np.arange(4), + np.arange(4).astype(np.float64), + list(range(4)), + # match Period semantics by not treating integers as Periods + [2000, 2001, 2002, 2003], + np.arange(2000, 2004), + np.arange(2000, 2004).astype(object), + pd.Index([2000, 2001, 2002, 2003]), + ], + ) + def test_compare_invalid_listlike(self, box_with_array, other): + pi = period_range("2000", periods=4) + parr = tm.box_expected(pi, box_with_array) + assert_invalid_comparison(parr, other, box_with_array) + + @pytest.mark.parametrize("other_box", [list, np.array, lambda x: x.astype(object)]) + def test_compare_object_dtype(self, box_with_array, other_box): + pi = period_range("2000", periods=5) + parr = tm.box_expected(pi, box_with_array) + + other = other_box(pi) + xbox = get_upcast_box(parr, other, True) + + expected = np.array([True, True, True, True, True]) + expected = tm.box_expected(expected, xbox) + + result = parr == other + tm.assert_equal(result, expected) + result = parr <= other + tm.assert_equal(result, expected) + result = parr >= other + tm.assert_equal(result, expected) + + result = parr != other + tm.assert_equal(result, ~expected) + result = parr < other + tm.assert_equal(result, ~expected) + result = parr > other + tm.assert_equal(result, ~expected) + + other = other_box(pi[::-1]) + + expected = np.array([False, False, True, False, False]) + expected = tm.box_expected(expected, xbox) + result = parr == other + tm.assert_equal(result, expected) + + expected = np.array([True, True, True, False, False]) + expected = tm.box_expected(expected, xbox) + result = parr <= other + tm.assert_equal(result, expected) + + expected = np.array([False, False, True, True, True]) + expected = tm.box_expected(expected, xbox) + result = parr >= other + tm.assert_equal(result, expected) + + expected = np.array([True, True, False, True, True]) + expected = tm.box_expected(expected, xbox) + result = parr != other + tm.assert_equal(result, expected) + + expected = np.array([True, True, False, False, False]) + expected = tm.box_expected(expected, xbox) + result = parr < other + tm.assert_equal(result, expected) + + expected = np.array([False, False, False, True, True]) + expected = tm.box_expected(expected, xbox) + result = parr > other + tm.assert_equal(result, expected) + + +class TestPeriodIndexComparisons: + # TODO: parameterize over boxes + + def test_pi_cmp_period(self): + idx = period_range("2007-01", periods=20, freq="M") + per = idx[10] + + result = idx < per + exp = idx.values < idx.values[10] + tm.assert_numpy_array_equal(result, exp) + + # Tests Period.__richcmp__ against ndarray[object, ndim=2] + result = idx.values.reshape(10, 2) < per + tm.assert_numpy_array_equal(result, exp.reshape(10, 2)) + + # Tests Period.__richcmp__ against ndarray[object, ndim=0] + result = idx < np.array(per) + tm.assert_numpy_array_equal(result, exp) + + # TODO: moved from test_datetime64; de-duplicate with version below + def test_parr_cmp_period_scalar2(self, box_with_array): + pi = period_range("2000-01-01", periods=10, freq="D") + + val = pi[3] + expected = [x > val for x in pi] + + ser = tm.box_expected(pi, box_with_array) + xbox = get_upcast_box(ser, val, True) + + expected = tm.box_expected(expected, xbox) + result = ser > val + tm.assert_equal(result, expected) + + val = pi[5] + result = ser > val + expected = [x > val for x in pi] + expected = tm.box_expected(expected, xbox) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) + def test_parr_cmp_period_scalar(self, freq, box_with_array): + # GH#13200 + base = PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq=freq) + base = tm.box_expected(base, box_with_array) + per = Period("2011-02", freq=freq) + xbox = get_upcast_box(base, per, True) + + exp = np.array([False, True, False, False]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base == per, exp) + tm.assert_equal(per == base, exp) + + exp = np.array([True, False, True, True]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base != per, exp) + tm.assert_equal(per != base, exp) + + exp = np.array([False, False, True, True]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base > per, exp) + tm.assert_equal(per < base, exp) + + exp = np.array([True, False, False, False]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base < per, exp) + tm.assert_equal(per > base, exp) + + exp = np.array([False, True, True, True]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base >= per, exp) + tm.assert_equal(per <= base, exp) + + exp = np.array([True, True, False, False]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base <= per, exp) + tm.assert_equal(per >= base, exp) + + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) + def test_parr_cmp_pi(self, freq, box_with_array): + # GH#13200 + base = PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq=freq) + base = tm.box_expected(base, box_with_array) + + # TODO: could also box idx? + idx = PeriodIndex(["2011-02", "2011-01", "2011-03", "2011-05"], freq=freq) + + xbox = get_upcast_box(base, idx, True) + + exp = np.array([False, False, True, False]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base == idx, exp) + + exp = np.array([True, True, False, True]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base != idx, exp) + + exp = np.array([False, True, False, False]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base > idx, exp) + + exp = np.array([True, False, False, True]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base < idx, exp) + + exp = np.array([False, True, True, False]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base >= idx, exp) + + exp = np.array([True, False, True, True]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base <= idx, exp) + + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) + def test_parr_cmp_pi_mismatched_freq(self, freq, box_with_array): + # GH#13200 + # different base freq + base = PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq=freq) + base = tm.box_expected(base, box_with_array) + + msg = rf"Invalid comparison between dtype=period\[{freq}\] and Period" + with pytest.raises(TypeError, match=msg): + base <= Period("2011", freq="Y") + + with pytest.raises(TypeError, match=msg): + Period("2011", freq="Y") >= base + + # TODO: Could parametrize over boxes for idx? + idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="Y") + rev_msg = r"Invalid comparison between dtype=period\[Y-DEC\] and PeriodArray" + idx_msg = rev_msg if box_with_array in [tm.to_array, pd.array] else msg + with pytest.raises(TypeError, match=idx_msg): + base <= idx + + # Different frequency + msg = rf"Invalid comparison between dtype=period\[{freq}\] and Period" + with pytest.raises(TypeError, match=msg): + base <= Period("2011", freq="4M") + + with pytest.raises(TypeError, match=msg): + Period("2011", freq="4M") >= base + + idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="4M") + rev_msg = r"Invalid comparison between dtype=period\[4M\] and PeriodArray" + idx_msg = rev_msg if box_with_array in [tm.to_array, pd.array] else msg + with pytest.raises(TypeError, match=idx_msg): + base <= idx + + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) + def test_pi_cmp_nat(self, freq): + idx1 = PeriodIndex(["2011-01", "2011-02", "NaT", "2011-05"], freq=freq) + per = idx1[1] + + result = idx1 > per + exp = np.array([False, False, False, True]) + tm.assert_numpy_array_equal(result, exp) + result = per < idx1 + tm.assert_numpy_array_equal(result, exp) + + result = idx1 == pd.NaT + exp = np.array([False, False, False, False]) + tm.assert_numpy_array_equal(result, exp) + result = pd.NaT == idx1 + tm.assert_numpy_array_equal(result, exp) + + result = idx1 != pd.NaT + exp = np.array([True, True, True, True]) + tm.assert_numpy_array_equal(result, exp) + result = pd.NaT != idx1 + tm.assert_numpy_array_equal(result, exp) + + idx2 = PeriodIndex(["2011-02", "2011-01", "2011-04", "NaT"], freq=freq) + result = idx1 < idx2 + exp = np.array([True, False, False, False]) + tm.assert_numpy_array_equal(result, exp) + + result = idx1 == idx2 + exp = np.array([False, False, False, False]) + tm.assert_numpy_array_equal(result, exp) + + result = idx1 != idx2 + exp = np.array([True, True, True, True]) + tm.assert_numpy_array_equal(result, exp) + + result = idx1 == idx1 + exp = np.array([True, True, False, True]) + tm.assert_numpy_array_equal(result, exp) + + result = idx1 != idx1 + exp = np.array([False, False, True, False]) + tm.assert_numpy_array_equal(result, exp) + + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) + def test_pi_cmp_nat_mismatched_freq_raises(self, freq): + idx1 = PeriodIndex(["2011-01", "2011-02", "NaT", "2011-05"], freq=freq) + + diff = PeriodIndex(["2011-02", "2011-01", "2011-04", "NaT"], freq="4M") + msg = rf"Invalid comparison between dtype=period\[{freq}\] and PeriodArray" + with pytest.raises(TypeError, match=msg): + idx1 > diff + + result = idx1 == diff + expected = np.array([False, False, False, False], dtype=bool) + tm.assert_numpy_array_equal(result, expected) + + # TODO: De-duplicate with test_pi_cmp_nat + @pytest.mark.parametrize("dtype", [object, None]) + def test_comp_nat(self, dtype): + left = PeriodIndex([Period("2011-01-01"), pd.NaT, Period("2011-01-03")]) + right = PeriodIndex([pd.NaT, pd.NaT, Period("2011-01-03")]) + + if dtype is not None: + left = left.astype(dtype) + right = right.astype(dtype) + + result = left == right + expected = np.array([False, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = left != right + expected = np.array([True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(left == pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT == right, expected) + + expected = np.array([True, True, True]) + tm.assert_numpy_array_equal(left != pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT != left, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(left < pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT > left, expected) + + +class TestPeriodSeriesComparisons: + def test_cmp_series_period_series_mixed_freq(self): + # GH#13200 + base = Series( + [ + Period("2011", freq="Y"), + Period("2011-02", freq="M"), + Period("2013", freq="Y"), + Period("2011-04", freq="M"), + ] + ) + + ser = Series( + [ + Period("2012", freq="Y"), + Period("2011-01", freq="M"), + Period("2013", freq="Y"), + Period("2011-05", freq="M"), + ] + ) + + exp = Series([False, False, True, False]) + tm.assert_series_equal(base == ser, exp) + + exp = Series([True, True, False, True]) + tm.assert_series_equal(base != ser, exp) + + exp = Series([False, True, False, False]) + tm.assert_series_equal(base > ser, exp) + + exp = Series([True, False, False, True]) + tm.assert_series_equal(base < ser, exp) + + exp = Series([False, True, True, False]) + tm.assert_series_equal(base >= ser, exp) + + exp = Series([True, False, True, True]) + tm.assert_series_equal(base <= ser, exp) + + +class TestPeriodIndexSeriesComparisonConsistency: + """Test PeriodIndex and Period Series Ops consistency""" + + # TODO: needs parametrization+de-duplication + + def _check(self, values, func, expected): + # Test PeriodIndex and Period Series Ops consistency + + idx = PeriodIndex(values) + result = func(idx) + + # check that we don't pass an unwanted type to tm.assert_equal + assert isinstance(expected, (pd.Index, np.ndarray)) + tm.assert_equal(result, expected) + + s = Series(values) + result = func(s) + + exp = Series(expected, name=values.name) + tm.assert_series_equal(result, exp) + + def test_pi_comp_period(self): + idx = PeriodIndex( + ["2011-01", "2011-02", "2011-03", "2011-04"], freq="M", name="idx" + ) + per = idx[2] + + f = lambda x: x == per + exp = np.array([False, False, True, False], dtype=np.bool_) + self._check(idx, f, exp) + f = lambda x: per == x + self._check(idx, f, exp) + + f = lambda x: x != per + exp = np.array([True, True, False, True], dtype=np.bool_) + self._check(idx, f, exp) + f = lambda x: per != x + self._check(idx, f, exp) + + f = lambda x: per >= x + exp = np.array([True, True, True, False], dtype=np.bool_) + self._check(idx, f, exp) + + f = lambda x: x > per + exp = np.array([False, False, False, True], dtype=np.bool_) + self._check(idx, f, exp) + + f = lambda x: per >= x + exp = np.array([True, True, True, False], dtype=np.bool_) + self._check(idx, f, exp) + + def test_pi_comp_period_nat(self): + idx = PeriodIndex( + ["2011-01", "NaT", "2011-03", "2011-04"], freq="M", name="idx" + ) + per = idx[2] + + f = lambda x: x == per + exp = np.array([False, False, True, False], dtype=np.bool_) + self._check(idx, f, exp) + f = lambda x: per == x + self._check(idx, f, exp) + + f = lambda x: x == pd.NaT + exp = np.array([False, False, False, False], dtype=np.bool_) + self._check(idx, f, exp) + f = lambda x: pd.NaT == x + self._check(idx, f, exp) + + f = lambda x: x != per + exp = np.array([True, True, False, True], dtype=np.bool_) + self._check(idx, f, exp) + f = lambda x: per != x + self._check(idx, f, exp) + + f = lambda x: x != pd.NaT + exp = np.array([True, True, True, True], dtype=np.bool_) + self._check(idx, f, exp) + f = lambda x: pd.NaT != x + self._check(idx, f, exp) + + f = lambda x: per >= x + exp = np.array([True, False, True, False], dtype=np.bool_) + self._check(idx, f, exp) + + f = lambda x: x < per + exp = np.array([True, False, False, False], dtype=np.bool_) + self._check(idx, f, exp) + + f = lambda x: x > pd.NaT + exp = np.array([False, False, False, False], dtype=np.bool_) + self._check(idx, f, exp) + + f = lambda x: pd.NaT >= x + exp = np.array([False, False, False, False], dtype=np.bool_) + self._check(idx, f, exp) + + +# ------------------------------------------------------------------ +# Arithmetic + + +class TestPeriodFrameArithmetic: + def test_ops_frame_period(self): + # GH#13043 + df = pd.DataFrame( + { + "A": [Period("2015-01", freq="M"), Period("2015-02", freq="M")], + "B": [Period("2014-01", freq="M"), Period("2014-02", freq="M")], + } + ) + assert df["A"].dtype == "Period[M]" + assert df["B"].dtype == "Period[M]" + + p = Period("2015-03", freq="M") + off = p.freq + # dtype will be object because of original dtype + exp = pd.DataFrame( + { + "A": np.array([2 * off, 1 * off], dtype=object), + "B": np.array([14 * off, 13 * off], dtype=object), + } + ) + tm.assert_frame_equal(p - df, exp) + tm.assert_frame_equal(df - p, -1 * exp) + + df2 = pd.DataFrame( + { + "A": [Period("2015-05", freq="M"), Period("2015-06", freq="M")], + "B": [Period("2015-05", freq="M"), Period("2015-06", freq="M")], + } + ) + assert df2["A"].dtype == "Period[M]" + assert df2["B"].dtype == "Period[M]" + + exp = pd.DataFrame( + { + "A": np.array([4 * off, 4 * off], dtype=object), + "B": np.array([16 * off, 16 * off], dtype=object), + } + ) + tm.assert_frame_equal(df2 - df, exp) + tm.assert_frame_equal(df - df2, -1 * exp) + + +class TestPeriodIndexArithmetic: + # --------------------------------------------------------------- + # __add__/__sub__ with PeriodIndex + # PeriodIndex + other is defined for integers and timedelta-like others + # PeriodIndex - other is defined for integers, timedelta-like others, + # and PeriodIndex (with matching freq) + + def test_parr_add_iadd_parr_raises(self, box_with_array): + rng = period_range("1/1/2000", freq="D", periods=5) + other = period_range("1/6/2000", freq="D", periods=5) + # TODO: parametrize over boxes for other? + + rng = tm.box_expected(rng, box_with_array) + # An earlier implementation of PeriodIndex addition performed + # a set operation (union). This has since been changed to + # raise a TypeError. See GH#14164 and GH#13077 for historical + # reference. + msg = r"unsupported operand type\(s\) for \+: .* and .*" + with pytest.raises(TypeError, match=msg): + rng + other + + with pytest.raises(TypeError, match=msg): + rng += other + + def test_pi_sub_isub_pi(self): + # GH#20049 + # For historical reference see GH#14164, GH#13077. + # PeriodIndex subtraction originally performed set difference, + # then changed to raise TypeError before being implemented in GH#20049 + rng = period_range("1/1/2000", freq="D", periods=5) + other = period_range("1/6/2000", freq="D", periods=5) + + off = rng.freq + expected = pd.Index([-5 * off] * 5) + result = rng - other + tm.assert_index_equal(result, expected) + + rng -= other + tm.assert_index_equal(rng, expected) + + def test_pi_sub_pi_with_nat(self): + rng = period_range("1/1/2000", freq="D", periods=5) + other = rng[1:].insert(0, pd.NaT) + assert other[1:].equals(rng[1:]) + + result = rng - other + off = rng.freq + expected = pd.Index([pd.NaT, 0 * off, 0 * off, 0 * off, 0 * off]) + tm.assert_index_equal(result, expected) + + def test_parr_sub_pi_mismatched_freq(self, box_with_array, box_with_array2): + rng = period_range("1/1/2000", freq="D", periods=5) + other = period_range("1/6/2000", freq="h", periods=5) + + rng = tm.box_expected(rng, box_with_array) + other = tm.box_expected(other, box_with_array2) + msg = r"Input has different freq=[hD] from PeriodArray\(freq=[Dh]\)" + with pytest.raises(IncompatibleFrequency, match=msg): + rng - other + + @pytest.mark.parametrize("n", [1, 2, 3, 4]) + def test_sub_n_gt_1_ticks(self, tick_classes, n): + # GH 23878 + p1_d = "19910905" + p2_d = "19920406" + p1 = PeriodIndex([p1_d], freq=tick_classes(n)) + p2 = PeriodIndex([p2_d], freq=tick_classes(n)) + + expected = PeriodIndex([p2_d], freq=p2.freq.base) - PeriodIndex( + [p1_d], freq=p1.freq.base + ) + + tm.assert_index_equal((p2 - p1), expected) + + @pytest.mark.parametrize("n", [1, 2, 3, 4]) + @pytest.mark.parametrize( + "offset, kwd_name", + [ + (pd.offsets.YearEnd, "month"), + (pd.offsets.QuarterEnd, "startingMonth"), + (pd.offsets.MonthEnd, None), + (pd.offsets.Week, "weekday"), + ], + ) + def test_sub_n_gt_1_offsets(self, offset, kwd_name, n): + # GH 23878 + kwds = {kwd_name: 3} if kwd_name is not None else {} + p1_d = "19910905" + p2_d = "19920406" + freq = offset(n, normalize=False, **kwds) + p1 = PeriodIndex([p1_d], freq=freq) + p2 = PeriodIndex([p2_d], freq=freq) + + result = p2 - p1 + expected = PeriodIndex([p2_d], freq=freq.base) - PeriodIndex( + [p1_d], freq=freq.base + ) + + tm.assert_index_equal(result, expected) + + # ------------------------------------------------------------- + # Invalid Operations + + @pytest.mark.parametrize( + "other", + [ + # datetime scalars + Timestamp("2016-01-01"), + Timestamp("2016-01-01").to_pydatetime(), + Timestamp("2016-01-01").to_datetime64(), + # datetime-like arrays + pd.date_range("2016-01-01", periods=3, freq="h"), + pd.date_range("2016-01-01", periods=3, tz="Europe/Brussels"), + pd.date_range("2016-01-01", periods=3, freq="s")._data, + pd.date_range("2016-01-01", periods=3, tz="Asia/Tokyo")._data, + # Miscellaneous invalid types + 3.14, + np.array([2.0, 3.0, 4.0]), + ], + ) + def test_parr_add_sub_invalid(self, other, box_with_array): + # GH#23215 + rng = period_range("1/1/2000", freq="D", periods=3) + rng = tm.box_expected(rng, box_with_array) + + msg = "|".join( + [ + r"(:?cannot add PeriodArray and .*)", + r"(:?cannot subtract .* from (:?a\s)?.*)", + r"(:?unsupported operand type\(s\) for \+: .* and .*)", + r"unsupported operand type\(s\) for [+-]: .* and .*", + ] + ) + assert_invalid_addsub_type(rng, other, msg) + with pytest.raises(TypeError, match=msg): + rng + other + with pytest.raises(TypeError, match=msg): + other + rng + with pytest.raises(TypeError, match=msg): + rng - other + with pytest.raises(TypeError, match=msg): + other - rng + + # ----------------------------------------------------------------- + # __add__/__sub__ with ndarray[datetime64] and ndarray[timedelta64] + + def test_pi_add_sub_td64_array_non_tick_raises(self): + rng = period_range("1/1/2000", freq="Q", periods=3) + tdi = TimedeltaIndex(["-1 Day", "-1 Day", "-1 Day"]) + tdarr = tdi.values + + msg = r"Cannot add or subtract timedelta64\[ns\] dtype from period\[Q-DEC\]" + with pytest.raises(TypeError, match=msg): + rng + tdarr + with pytest.raises(TypeError, match=msg): + tdarr + rng + + with pytest.raises(TypeError, match=msg): + rng - tdarr + msg = r"cannot subtract PeriodArray from TimedeltaArray" + with pytest.raises(TypeError, match=msg): + tdarr - rng + + def test_pi_add_sub_td64_array_tick(self): + # PeriodIndex + Timedelta-like is allowed only with + # tick-like frequencies + rng = period_range("1/1/2000", freq="90D", periods=3) + tdi = TimedeltaIndex(["-1 Day", "-1 Day", "-1 Day"]) + tdarr = tdi.values + + expected = period_range("12/31/1999", freq="90D", periods=3) + result = rng + tdi + tm.assert_index_equal(result, expected) + result = rng + tdarr + tm.assert_index_equal(result, expected) + result = tdi + rng + tm.assert_index_equal(result, expected) + result = tdarr + rng + tm.assert_index_equal(result, expected) + + expected = period_range("1/2/2000", freq="90D", periods=3) + + result = rng - tdi + tm.assert_index_equal(result, expected) + result = rng - tdarr + tm.assert_index_equal(result, expected) + + msg = r"cannot subtract .* from .*" + with pytest.raises(TypeError, match=msg): + tdarr - rng + + with pytest.raises(TypeError, match=msg): + tdi - rng + + @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "h"]) + @pytest.mark.parametrize("tdi_freq", [None, "h"]) + def test_parr_sub_td64array(self, box_with_array, tdi_freq, pi_freq): + box = box_with_array + xbox = box if box not in [pd.array, tm.to_array] else pd.Index + + tdi = TimedeltaIndex(["1 hours", "2 hours"], freq=tdi_freq) + dti = Timestamp("2018-03-07 17:16:40") + tdi + pi = dti.to_period(pi_freq) + + # TODO: parametrize over box for pi? + td64obj = tm.box_expected(tdi, box) + + if pi_freq == "h": + result = pi - td64obj + expected = (pi.to_timestamp("s") - tdi).to_period(pi_freq) + expected = tm.box_expected(expected, xbox) + tm.assert_equal(result, expected) + + # Subtract from scalar + result = pi[0] - td64obj + expected = (pi[0].to_timestamp("s") - tdi).to_period(pi_freq) + expected = tm.box_expected(expected, box) + tm.assert_equal(result, expected) + + elif pi_freq == "D": + # Tick, but non-compatible + msg = ( + "Cannot add/subtract timedelta-like from PeriodArray that is " + "not an integer multiple of the PeriodArray's freq." + ) + with pytest.raises(IncompatibleFrequency, match=msg): + pi - td64obj + + with pytest.raises(IncompatibleFrequency, match=msg): + pi[0] - td64obj + + else: + # With non-Tick freq, we could not add timedelta64 array regardless + # of what its resolution is + msg = "Cannot add or subtract timedelta64" + with pytest.raises(TypeError, match=msg): + pi - td64obj + with pytest.raises(TypeError, match=msg): + pi[0] - td64obj + + # ----------------------------------------------------------------- + # operations with array/Index of DateOffset objects + + @pytest.mark.parametrize("box", [np.array, pd.Index]) + def test_pi_add_offset_array(self, box): + # GH#18849 + pi = PeriodIndex([Period("2015Q1"), Period("2016Q2")]) + offs = box( + [ + pd.offsets.QuarterEnd(n=1, startingMonth=12), + pd.offsets.QuarterEnd(n=-2, startingMonth=12), + ] + ) + expected = PeriodIndex([Period("2015Q2"), Period("2015Q4")]).astype(object) + + with tm.assert_produces_warning(PerformanceWarning): + res = pi + offs + tm.assert_index_equal(res, expected) + + with tm.assert_produces_warning(PerformanceWarning): + res2 = offs + pi + tm.assert_index_equal(res2, expected) + + unanchored = np.array([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)]) + # addition/subtraction ops with incompatible offsets should issue + # a PerformanceWarning and _then_ raise a TypeError. + msg = r"Input cannot be converted to Period\(freq=Q-DEC\)" + with pytest.raises(IncompatibleFrequency, match=msg): + with tm.assert_produces_warning(PerformanceWarning): + pi + unanchored + with pytest.raises(IncompatibleFrequency, match=msg): + with tm.assert_produces_warning(PerformanceWarning): + unanchored + pi + + @pytest.mark.parametrize("box", [np.array, pd.Index]) + def test_pi_sub_offset_array(self, box): + # GH#18824 + pi = PeriodIndex([Period("2015Q1"), Period("2016Q2")]) + other = box( + [ + pd.offsets.QuarterEnd(n=1, startingMonth=12), + pd.offsets.QuarterEnd(n=-2, startingMonth=12), + ] + ) + + expected = PeriodIndex([pi[n] - other[n] for n in range(len(pi))]) + expected = expected.astype(object) + + with tm.assert_produces_warning(PerformanceWarning): + res = pi - other + tm.assert_index_equal(res, expected) + + anchored = box([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) + + # addition/subtraction ops with anchored offsets should issue + # a PerformanceWarning and _then_ raise a TypeError. + msg = r"Input has different freq=-1M from Period\(freq=Q-DEC\)" + with pytest.raises(IncompatibleFrequency, match=msg): + with tm.assert_produces_warning(PerformanceWarning): + pi - anchored + with pytest.raises(IncompatibleFrequency, match=msg): + with tm.assert_produces_warning(PerformanceWarning): + anchored - pi + + def test_pi_add_iadd_int(self, one): + # Variants of `one` for #19012 + rng = period_range("2000-01-01 09:00", freq="h", periods=10) + result = rng + one + expected = period_range("2000-01-01 10:00", freq="h", periods=10) + tm.assert_index_equal(result, expected) + rng += one + tm.assert_index_equal(rng, expected) + + def test_pi_sub_isub_int(self, one): + """ + PeriodIndex.__sub__ and __isub__ with several representations of + the integer 1, e.g. int, np.int64, np.uint8, ... + """ + rng = period_range("2000-01-01 09:00", freq="h", periods=10) + result = rng - one + expected = period_range("2000-01-01 08:00", freq="h", periods=10) + tm.assert_index_equal(result, expected) + rng -= one + tm.assert_index_equal(rng, expected) + + @pytest.mark.parametrize("five", [5, np.array(5, dtype=np.int64)]) + def test_pi_sub_intlike(self, five): + rng = period_range("2007-01", periods=50) + + result = rng - five + exp = rng + (-five) + tm.assert_index_equal(result, exp) + + def test_pi_add_sub_int_array_freqn_gt1(self): + # GH#47209 test adding array of ints when freq.n > 1 matches + # scalar behavior + pi = period_range("2016-01-01", periods=10, freq="2D") + arr = np.arange(10) + result = pi + arr + expected = pd.Index([x + y for x, y in zip(pi, arr)]) + tm.assert_index_equal(result, expected) + + result = pi - arr + expected = pd.Index([x - y for x, y in zip(pi, arr)]) + tm.assert_index_equal(result, expected) + + def test_pi_sub_isub_offset(self): + # offset + # DateOffset + rng = period_range("2014", "2024", freq="Y") + result = rng - pd.offsets.YearEnd(5) + expected = period_range("2009", "2019", freq="Y") + tm.assert_index_equal(result, expected) + rng -= pd.offsets.YearEnd(5) + tm.assert_index_equal(rng, expected) + + rng = period_range("2014-01", "2016-12", freq="M") + result = rng - pd.offsets.MonthEnd(5) + expected = period_range("2013-08", "2016-07", freq="M") + tm.assert_index_equal(result, expected) + + rng -= pd.offsets.MonthEnd(5) + tm.assert_index_equal(rng, expected) + + @pytest.mark.parametrize("transpose", [True, False]) + def test_pi_add_offset_n_gt1(self, box_with_array, transpose): + # GH#23215 + # add offset to PeriodIndex with freq.n > 1 + + per = Period("2016-01", freq="2M") + pi = PeriodIndex([per]) + + expected = PeriodIndex(["2016-03"], freq="2M") + + pi = tm.box_expected(pi, box_with_array, transpose=transpose) + expected = tm.box_expected(expected, box_with_array, transpose=transpose) + + result = pi + per.freq + tm.assert_equal(result, expected) + + result = per.freq + pi + tm.assert_equal(result, expected) + + def test_pi_add_offset_n_gt1_not_divisible(self, box_with_array): + # GH#23215 + # PeriodIndex with freq.n > 1 add offset with offset.n % freq.n != 0 + pi = PeriodIndex(["2016-01"], freq="2M") + expected = PeriodIndex(["2016-04"], freq="2M") + + pi = tm.box_expected(pi, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = pi + to_offset("3ME") + tm.assert_equal(result, expected) + + result = to_offset("3ME") + pi + tm.assert_equal(result, expected) + + # --------------------------------------------------------------- + # __add__/__sub__ with integer arrays + + @pytest.mark.parametrize("int_holder", [np.array, pd.Index]) + @pytest.mark.parametrize("op", [operator.add, ops.radd]) + def test_pi_add_intarray(self, int_holder, op): + # GH#19959 + pi = PeriodIndex([Period("2015Q1"), Period("NaT")]) + other = int_holder([4, -1]) + + result = op(pi, other) + expected = PeriodIndex([Period("2016Q1"), Period("NaT")]) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("int_holder", [np.array, pd.Index]) + def test_pi_sub_intarray(self, int_holder): + # GH#19959 + pi = PeriodIndex([Period("2015Q1"), Period("NaT")]) + other = int_holder([4, -1]) + + result = pi - other + expected = PeriodIndex([Period("2014Q1"), Period("NaT")]) + tm.assert_index_equal(result, expected) + + msg = r"bad operand type for unary -: 'PeriodArray'" + with pytest.raises(TypeError, match=msg): + other - pi + + # --------------------------------------------------------------- + # Timedelta-like (timedelta, timedelta64, Timedelta, Tick) + # TODO: Some of these are misnomers because of non-Tick DateOffsets + + def test_parr_add_timedeltalike_minute_gt1(self, three_days, box_with_array): + # GH#23031 adding a time-delta-like offset to a PeriodArray that has + # minute frequency with n != 1. A more general case is tested below + # in test_pi_add_timedeltalike_tick_gt1, but here we write out the + # expected result more explicitly. + other = three_days + rng = period_range("2014-05-01", periods=3, freq="2D") + rng = tm.box_expected(rng, box_with_array) + + expected = PeriodIndex(["2014-05-04", "2014-05-06", "2014-05-08"], freq="2D") + expected = tm.box_expected(expected, box_with_array) + + result = rng + other + tm.assert_equal(result, expected) + + result = other + rng + tm.assert_equal(result, expected) + + # subtraction + expected = PeriodIndex(["2014-04-28", "2014-04-30", "2014-05-02"], freq="2D") + expected = tm.box_expected(expected, box_with_array) + result = rng - other + tm.assert_equal(result, expected) + + msg = "|".join( + [ + r"bad operand type for unary -: 'PeriodArray'", + r"cannot subtract PeriodArray from timedelta64\[[hD]\]", + ] + ) + with pytest.raises(TypeError, match=msg): + other - rng + + @pytest.mark.parametrize("freqstr", ["5ns", "5us", "5ms", "5s", "5min", "5h", "5d"]) + def test_parr_add_timedeltalike_tick_gt1(self, three_days, freqstr, box_with_array): + # GH#23031 adding a time-delta-like offset to a PeriodArray that has + # tick-like frequency with n != 1 + other = three_days + rng = period_range("2014-05-01", periods=6, freq=freqstr) + first = rng[0] + rng = tm.box_expected(rng, box_with_array) + + expected = period_range(first + other, periods=6, freq=freqstr) + expected = tm.box_expected(expected, box_with_array) + + result = rng + other + tm.assert_equal(result, expected) + + result = other + rng + tm.assert_equal(result, expected) + + # subtraction + expected = period_range(first - other, periods=6, freq=freqstr) + expected = tm.box_expected(expected, box_with_array) + result = rng - other + tm.assert_equal(result, expected) + msg = "|".join( + [ + r"bad operand type for unary -: 'PeriodArray'", + r"cannot subtract PeriodArray from timedelta64\[[hD]\]", + ] + ) + with pytest.raises(TypeError, match=msg): + other - rng + + def test_pi_add_iadd_timedeltalike_daily(self, three_days): + # Tick + other = three_days + rng = period_range("2014-05-01", "2014-05-15", freq="D") + expected = period_range("2014-05-04", "2014-05-18", freq="D") + + result = rng + other + tm.assert_index_equal(result, expected) + + rng += other + tm.assert_index_equal(rng, expected) + + def test_pi_sub_isub_timedeltalike_daily(self, three_days): + # Tick-like 3 Days + other = three_days + rng = period_range("2014-05-01", "2014-05-15", freq="D") + expected = period_range("2014-04-28", "2014-05-12", freq="D") + + result = rng - other + tm.assert_index_equal(result, expected) + + rng -= other + tm.assert_index_equal(rng, expected) + + def test_parr_add_sub_timedeltalike_freq_mismatch_daily( + self, not_daily, box_with_array + ): + other = not_daily + rng = period_range("2014-05-01", "2014-05-15", freq="D") + rng = tm.box_expected(rng, box_with_array) + + msg = "|".join( + [ + # non-timedelta-like DateOffset + "Input has different freq(=.+)? from Period.*?\\(freq=D\\)", + # timedelta/td64/Timedelta but not a multiple of 24H + "Cannot add/subtract timedelta-like from PeriodArray that is " + "not an integer multiple of the PeriodArray's freq.", + ] + ) + with pytest.raises(IncompatibleFrequency, match=msg): + rng + other + with pytest.raises(IncompatibleFrequency, match=msg): + rng += other + with pytest.raises(IncompatibleFrequency, match=msg): + rng - other + with pytest.raises(IncompatibleFrequency, match=msg): + rng -= other + + def test_pi_add_iadd_timedeltalike_hourly(self, two_hours): + other = two_hours + rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="h") + expected = period_range("2014-01-01 12:00", "2014-01-05 12:00", freq="h") + + result = rng + other + tm.assert_index_equal(result, expected) + + rng += other + tm.assert_index_equal(rng, expected) + + def test_parr_add_timedeltalike_mismatched_freq_hourly( + self, not_hourly, box_with_array + ): + other = not_hourly + rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="h") + rng = tm.box_expected(rng, box_with_array) + msg = "|".join( + [ + # non-timedelta-like DateOffset + "Input has different freq(=.+)? from Period.*?\\(freq=h\\)", + # timedelta/td64/Timedelta but not a multiple of 24H + "Cannot add/subtract timedelta-like from PeriodArray that is " + "not an integer multiple of the PeriodArray's freq.", + ] + ) + + with pytest.raises(IncompatibleFrequency, match=msg): + rng + other + + with pytest.raises(IncompatibleFrequency, match=msg): + rng += other + + def test_pi_sub_isub_timedeltalike_hourly(self, two_hours): + other = two_hours + rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="h") + expected = period_range("2014-01-01 08:00", "2014-01-05 08:00", freq="h") + + result = rng - other + tm.assert_index_equal(result, expected) + + rng -= other + tm.assert_index_equal(rng, expected) + + def test_add_iadd_timedeltalike_annual(self): + # offset + # DateOffset + rng = period_range("2014", "2024", freq="Y") + result = rng + pd.offsets.YearEnd(5) + expected = period_range("2019", "2029", freq="Y") + tm.assert_index_equal(result, expected) + rng += pd.offsets.YearEnd(5) + tm.assert_index_equal(rng, expected) + + def test_pi_add_sub_timedeltalike_freq_mismatch_annual(self, mismatched_freq): + other = mismatched_freq + rng = period_range("2014", "2024", freq="Y") + msg = "Input has different freq(=.+)? from Period.*?\\(freq=Y-DEC\\)" + with pytest.raises(IncompatibleFrequency, match=msg): + rng + other + with pytest.raises(IncompatibleFrequency, match=msg): + rng += other + with pytest.raises(IncompatibleFrequency, match=msg): + rng - other + with pytest.raises(IncompatibleFrequency, match=msg): + rng -= other + + def test_pi_add_iadd_timedeltalike_M(self): + rng = period_range("2014-01", "2016-12", freq="M") + expected = period_range("2014-06", "2017-05", freq="M") + + result = rng + pd.offsets.MonthEnd(5) + tm.assert_index_equal(result, expected) + + rng += pd.offsets.MonthEnd(5) + tm.assert_index_equal(rng, expected) + + def test_pi_add_sub_timedeltalike_freq_mismatch_monthly(self, mismatched_freq): + other = mismatched_freq + rng = period_range("2014-01", "2016-12", freq="M") + msg = "Input has different freq(=.+)? from Period.*?\\(freq=M\\)" + with pytest.raises(IncompatibleFrequency, match=msg): + rng + other + with pytest.raises(IncompatibleFrequency, match=msg): + rng += other + with pytest.raises(IncompatibleFrequency, match=msg): + rng - other + with pytest.raises(IncompatibleFrequency, match=msg): + rng -= other + + @pytest.mark.parametrize("transpose", [True, False]) + def test_parr_add_sub_td64_nat(self, box_with_array, transpose): + # GH#23320 special handling for timedelta64("NaT") + pi = period_range("1994-04-01", periods=9, freq="19D") + other = np.timedelta64("NaT") + expected = PeriodIndex(["NaT"] * 9, freq="19D") + + obj = tm.box_expected(pi, box_with_array, transpose=transpose) + expected = tm.box_expected(expected, box_with_array, transpose=transpose) + + result = obj + other + tm.assert_equal(result, expected) + result = other + obj + tm.assert_equal(result, expected) + result = obj - other + tm.assert_equal(result, expected) + msg = r"cannot subtract .* from .*" + with pytest.raises(TypeError, match=msg): + other - obj + + @pytest.mark.parametrize( + "other", + [ + np.array(["NaT"] * 9, dtype="m8[ns]"), + TimedeltaArray._from_sequence(["NaT"] * 9, dtype="m8[ns]"), + ], + ) + def test_parr_add_sub_tdt64_nat_array(self, box_with_array, other): + pi = period_range("1994-04-01", periods=9, freq="19D") + expected = PeriodIndex(["NaT"] * 9, freq="19D") + + obj = tm.box_expected(pi, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = obj + other + tm.assert_equal(result, expected) + result = other + obj + tm.assert_equal(result, expected) + result = obj - other + tm.assert_equal(result, expected) + msg = r"cannot subtract .* from .*" + with pytest.raises(TypeError, match=msg): + other - obj + + # some but not *all* NaT + other = other.copy() + other[0] = np.timedelta64(0, "ns") + expected = PeriodIndex([pi[0]] + ["NaT"] * 8, freq="19D") + expected = tm.box_expected(expected, box_with_array) + + result = obj + other + tm.assert_equal(result, expected) + result = other + obj + tm.assert_equal(result, expected) + result = obj - other + tm.assert_equal(result, expected) + with pytest.raises(TypeError, match=msg): + other - obj + + # --------------------------------------------------------------- + # Unsorted + + def test_parr_add_sub_index(self): + # Check that PeriodArray defers to Index on arithmetic ops + pi = period_range("2000-12-31", periods=3) + parr = pi.array + + result = parr - pi + expected = pi - pi + tm.assert_index_equal(result, expected) + + def test_parr_add_sub_object_array(self): + pi = period_range("2000-12-31", periods=3, freq="D") + parr = pi.array + + other = np.array([Timedelta(days=1), pd.offsets.Day(2), 3]) + + with tm.assert_produces_warning(PerformanceWarning): + result = parr + other + + expected = PeriodIndex( + ["2001-01-01", "2001-01-03", "2001-01-05"], freq="D" + )._data.astype(object) + tm.assert_equal(result, expected) + + with tm.assert_produces_warning(PerformanceWarning): + result = parr - other + + expected = PeriodIndex(["2000-12-30"] * 3, freq="D")._data.astype(object) + tm.assert_equal(result, expected) + + def test_period_add_timestamp_raises(self, box_with_array): + # GH#17983 + ts = Timestamp("2017") + per = Period("2017", freq="M") + + arr = pd.Index([per], dtype="Period[M]") + arr = tm.box_expected(arr, box_with_array) + + msg = "cannot add PeriodArray and Timestamp" + with pytest.raises(TypeError, match=msg): + arr + ts + with pytest.raises(TypeError, match=msg): + ts + arr + msg = "cannot add PeriodArray and DatetimeArray" + with pytest.raises(TypeError, match=msg): + arr + Series([ts]) + with pytest.raises(TypeError, match=msg): + Series([ts]) + arr + with pytest.raises(TypeError, match=msg): + arr + pd.Index([ts]) + with pytest.raises(TypeError, match=msg): + pd.Index([ts]) + arr + + if box_with_array is pd.DataFrame: + msg = "cannot add PeriodArray and DatetimeArray" + else: + msg = r"unsupported operand type\(s\) for \+: 'Period' and 'DatetimeArray" + with pytest.raises(TypeError, match=msg): + arr + pd.DataFrame([ts]) + if box_with_array is pd.DataFrame: + msg = "cannot add PeriodArray and DatetimeArray" + else: + msg = r"unsupported operand type\(s\) for \+: 'DatetimeArray' and 'Period'" + with pytest.raises(TypeError, match=msg): + pd.DataFrame([ts]) + arr + + +class TestPeriodSeriesArithmetic: + def test_parr_add_timedeltalike_scalar(self, three_days, box_with_array): + # GH#13043 + ser = Series( + [Period("2015-01-01", freq="D"), Period("2015-01-02", freq="D")], + name="xxx", + ) + assert ser.dtype == "Period[D]" + + expected = Series( + [Period("2015-01-04", freq="D"), Period("2015-01-05", freq="D")], + name="xxx", + ) + + obj = tm.box_expected(ser, box_with_array) + if box_with_array is pd.DataFrame: + assert (obj.dtypes == "Period[D]").all() + + expected = tm.box_expected(expected, box_with_array) + + result = obj + three_days + tm.assert_equal(result, expected) + + result = three_days + obj + tm.assert_equal(result, expected) + + def test_ops_series_period(self): + # GH#13043 + ser = Series( + [Period("2015-01-01", freq="D"), Period("2015-01-02", freq="D")], + name="xxx", + ) + assert ser.dtype == "Period[D]" + + per = Period("2015-01-10", freq="D") + off = per.freq + # dtype will be object because of original dtype + expected = Series([9 * off, 8 * off], name="xxx", dtype=object) + tm.assert_series_equal(per - ser, expected) + tm.assert_series_equal(ser - per, -1 * expected) + + s2 = Series( + [Period("2015-01-05", freq="D"), Period("2015-01-04", freq="D")], + name="xxx", + ) + assert s2.dtype == "Period[D]" + + expected = Series([4 * off, 2 * off], name="xxx", dtype=object) + tm.assert_series_equal(s2 - ser, expected) + tm.assert_series_equal(ser - s2, -1 * expected) + + +class TestPeriodIndexSeriesMethods: + """Test PeriodIndex and Period Series Ops consistency""" + + def _check(self, values, func, expected): + idx = PeriodIndex(values) + result = func(idx) + tm.assert_equal(result, expected) + + ser = Series(values) + result = func(ser) + + exp = Series(expected, name=values.name) + tm.assert_series_equal(result, exp) + + def test_pi_ops(self): + idx = PeriodIndex( + ["2011-01", "2011-02", "2011-03", "2011-04"], freq="M", name="idx" + ) + + expected = PeriodIndex( + ["2011-03", "2011-04", "2011-05", "2011-06"], freq="M", name="idx" + ) + + self._check(idx, lambda x: x + 2, expected) + self._check(idx, lambda x: 2 + x, expected) + + self._check(idx + 2, lambda x: x - 2, idx) + + result = idx - Period("2011-01", freq="M") + off = idx.freq + exp = pd.Index([0 * off, 1 * off, 2 * off, 3 * off], name="idx") + tm.assert_index_equal(result, exp) + + result = Period("2011-01", freq="M") - idx + exp = pd.Index([0 * off, -1 * off, -2 * off, -3 * off], name="idx") + tm.assert_index_equal(result, exp) + + @pytest.mark.parametrize("ng", ["str", 1.5]) + @pytest.mark.parametrize( + "func", + [ + lambda obj, ng: obj + ng, + lambda obj, ng: ng + obj, + lambda obj, ng: obj - ng, + lambda obj, ng: ng - obj, + lambda obj, ng: np.add(obj, ng), + lambda obj, ng: np.add(ng, obj), + lambda obj, ng: np.subtract(obj, ng), + lambda obj, ng: np.subtract(ng, obj), + ], + ) + def test_parr_ops_errors(self, ng, func, box_with_array): + idx = PeriodIndex( + ["2011-01", "2011-02", "2011-03", "2011-04"], freq="M", name="idx" + ) + obj = tm.box_expected(idx, box_with_array) + msg = "|".join( + [ + r"unsupported operand type\(s\)", + "can only concatenate", + r"must be str", + "object to str implicitly", + ] + ) + + with pytest.raises(TypeError, match=msg): + func(obj, ng) + + def test_pi_ops_nat(self): + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx" + ) + expected = PeriodIndex( + ["2011-03", "2011-04", "NaT", "2011-06"], freq="M", name="idx" + ) + + self._check(idx, lambda x: x + 2, expected) + self._check(idx, lambda x: 2 + x, expected) + self._check(idx, lambda x: np.add(x, 2), expected) + + self._check(idx + 2, lambda x: x - 2, idx) + self._check(idx + 2, lambda x: np.subtract(x, 2), idx) + + # freq with mult + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2011-04"], freq="2M", name="idx" + ) + expected = PeriodIndex( + ["2011-07", "2011-08", "NaT", "2011-10"], freq="2M", name="idx" + ) + + self._check(idx, lambda x: x + 3, expected) + self._check(idx, lambda x: 3 + x, expected) + self._check(idx, lambda x: np.add(x, 3), expected) + + self._check(idx + 3, lambda x: x - 3, idx) + self._check(idx + 3, lambda x: np.subtract(x, 3), idx) + + def test_pi_ops_array_int(self): + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx" + ) + f = lambda x: x + np.array([1, 2, 3, 4]) + exp = PeriodIndex( + ["2011-02", "2011-04", "NaT", "2011-08"], freq="M", name="idx" + ) + self._check(idx, f, exp) + + f = lambda x: np.add(x, np.array([4, -1, 1, 2])) + exp = PeriodIndex( + ["2011-05", "2011-01", "NaT", "2011-06"], freq="M", name="idx" + ) + self._check(idx, f, exp) + + f = lambda x: x - np.array([1, 2, 3, 4]) + exp = PeriodIndex( + ["2010-12", "2010-12", "NaT", "2010-12"], freq="M", name="idx" + ) + self._check(idx, f, exp) + + f = lambda x: np.subtract(x, np.array([3, 2, 3, -2])) + exp = PeriodIndex( + ["2010-10", "2010-12", "NaT", "2011-06"], freq="M", name="idx" + ) + self._check(idx, f, exp) + + def test_pi_ops_offset(self): + idx = PeriodIndex( + ["2011-01-01", "2011-02-01", "2011-03-01", "2011-04-01"], + freq="D", + name="idx", + ) + f = lambda x: x + pd.offsets.Day() + exp = PeriodIndex( + ["2011-01-02", "2011-02-02", "2011-03-02", "2011-04-02"], + freq="D", + name="idx", + ) + self._check(idx, f, exp) + + f = lambda x: x + pd.offsets.Day(2) + exp = PeriodIndex( + ["2011-01-03", "2011-02-03", "2011-03-03", "2011-04-03"], + freq="D", + name="idx", + ) + self._check(idx, f, exp) + + f = lambda x: x - pd.offsets.Day(2) + exp = PeriodIndex( + ["2010-12-30", "2011-01-30", "2011-02-27", "2011-03-30"], + freq="D", + name="idx", + ) + self._check(idx, f, exp) + + def test_pi_offset_errors(self): + idx = PeriodIndex( + ["2011-01-01", "2011-02-01", "2011-03-01", "2011-04-01"], + freq="D", + name="idx", + ) + ser = Series(idx) + + msg = ( + "Cannot add/subtract timedelta-like from PeriodArray that is not " + "an integer multiple of the PeriodArray's freq" + ) + for obj in [idx, ser]: + with pytest.raises(IncompatibleFrequency, match=msg): + obj + pd.offsets.Hour(2) + + with pytest.raises(IncompatibleFrequency, match=msg): + pd.offsets.Hour(2) + obj + + with pytest.raises(IncompatibleFrequency, match=msg): + obj - pd.offsets.Hour(2) + + def test_pi_sub_period(self): + # GH#13071 + idx = PeriodIndex( + ["2011-01", "2011-02", "2011-03", "2011-04"], freq="M", name="idx" + ) + + result = idx - Period("2012-01", freq="M") + off = idx.freq + exp = pd.Index([-12 * off, -11 * off, -10 * off, -9 * off], name="idx") + tm.assert_index_equal(result, exp) + + result = np.subtract(idx, Period("2012-01", freq="M")) + tm.assert_index_equal(result, exp) + + result = Period("2012-01", freq="M") - idx + exp = pd.Index([12 * off, 11 * off, 10 * off, 9 * off], name="idx") + tm.assert_index_equal(result, exp) + + result = np.subtract(Period("2012-01", freq="M"), idx) + tm.assert_index_equal(result, exp) + + exp = TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name="idx") + result = idx - Period("NaT", freq="M") + tm.assert_index_equal(result, exp) + assert result.freq == exp.freq + + result = Period("NaT", freq="M") - idx + tm.assert_index_equal(result, exp) + assert result.freq == exp.freq + + def test_pi_sub_pdnat(self): + # GH#13071, GH#19389 + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx" + ) + exp = TimedeltaIndex([pd.NaT] * 4, name="idx") + tm.assert_index_equal(pd.NaT - idx, exp) + tm.assert_index_equal(idx - pd.NaT, exp) + + def test_pi_sub_period_nat(self): + # GH#13071 + idx = PeriodIndex( + ["2011-01", "NaT", "2011-03", "2011-04"], freq="M", name="idx" + ) + + result = idx - Period("2012-01", freq="M") + off = idx.freq + exp = pd.Index([-12 * off, pd.NaT, -10 * off, -9 * off], name="idx") + tm.assert_index_equal(result, exp) + + result = Period("2012-01", freq="M") - idx + exp = pd.Index([12 * off, pd.NaT, 10 * off, 9 * off], name="idx") + tm.assert_index_equal(result, exp) + + exp = TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name="idx") + tm.assert_index_equal(idx - Period("NaT", freq="M"), exp) + tm.assert_index_equal(Period("NaT", freq="M") - idx, exp) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/test_timedelta64.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/test_timedelta64.py new file mode 100644 index 0000000000000000000000000000000000000000..d02e827d435cf16c806b5130f5949143f51c15e3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arithmetic/test_timedelta64.py @@ -0,0 +1,2179 @@ +# Arithmetic tests for DataFrame/Series/Index/Array classes that should +# behave identically. +from datetime import ( + datetime, + timedelta, +) + +import numpy as np +import pytest + +from pandas.errors import ( + OutOfBoundsDatetime, + PerformanceWarning, +) + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + NaT, + Series, + Timedelta, + TimedeltaIndex, + Timestamp, + offsets, + timedelta_range, +) +import pandas._testing as tm +from pandas.core.arrays import NumpyExtensionArray +from pandas.tests.arithmetic.common import ( + assert_invalid_addsub_type, + assert_invalid_comparison, + get_upcast_box, +) + + +def assert_dtype(obj, expected_dtype): + """ + Helper to check the dtype for a Series, Index, or single-column DataFrame. + """ + dtype = tm.get_dtype(obj) + + assert dtype == expected_dtype + + +def get_expected_name(box, names): + if box is DataFrame: + # Since we are operating with a DataFrame and a non-DataFrame, + # the non-DataFrame is cast to Series and its name ignored. + exname = names[0] + elif box in [tm.to_array, pd.array]: + exname = names[1] + else: + exname = names[2] + return exname + + +# ------------------------------------------------------------------ +# Timedelta64[ns] dtype Comparisons + + +class TestTimedelta64ArrayLikeComparisons: + # Comparison tests for timedelta64[ns] vectors fully parametrized over + # DataFrame/Series/TimedeltaIndex/TimedeltaArray. Ideally all comparison + # tests will eventually end up here. + + def test_compare_timedelta64_zerodim(self, box_with_array): + # GH#26689 should unbox when comparing with zerodim array + box = box_with_array + xbox = box_with_array if box_with_array not in [Index, pd.array] else np.ndarray + + tdi = timedelta_range("2h", periods=4) + other = np.array(tdi.to_numpy()[0]) + + tdi = tm.box_expected(tdi, box) + res = tdi <= other + expected = np.array([True, False, False, False]) + expected = tm.box_expected(expected, xbox) + tm.assert_equal(res, expected) + + @pytest.mark.parametrize( + "td_scalar", + [ + timedelta(days=1), + Timedelta(days=1), + Timedelta(days=1).to_timedelta64(), + offsets.Hour(24), + ], + ) + def test_compare_timedeltalike_scalar(self, box_with_array, td_scalar): + # regression test for GH#5963 + box = box_with_array + xbox = box if box not in [Index, pd.array] else np.ndarray + + ser = Series([timedelta(days=1), timedelta(days=2)]) + ser = tm.box_expected(ser, box) + actual = ser > td_scalar + expected = Series([False, True]) + expected = tm.box_expected(expected, xbox) + tm.assert_equal(actual, expected) + + @pytest.mark.parametrize( + "invalid", + [ + 345600000000000, + "a", + Timestamp("2021-01-01"), + Timestamp("2021-01-01").now("UTC"), + Timestamp("2021-01-01").now().to_datetime64(), + Timestamp("2021-01-01").now().to_pydatetime(), + Timestamp("2021-01-01").date(), + np.array(4), # zero-dim mismatched dtype + ], + ) + def test_td64_comparisons_invalid(self, box_with_array, invalid): + # GH#13624 for str + box = box_with_array + + rng = timedelta_range("1 days", periods=10) + obj = tm.box_expected(rng, box) + + assert_invalid_comparison(obj, invalid, box) + + @pytest.mark.parametrize( + "other", + [ + list(range(10)), + np.arange(10), + np.arange(10).astype(np.float32), + np.arange(10).astype(object), + pd.date_range("1970-01-01", periods=10, tz="UTC").array, + np.array(pd.date_range("1970-01-01", periods=10)), + list(pd.date_range("1970-01-01", periods=10)), + pd.date_range("1970-01-01", periods=10).astype(object), + pd.period_range("1971-01-01", freq="D", periods=10).array, + pd.period_range("1971-01-01", freq="D", periods=10).astype(object), + ], + ) + def test_td64arr_cmp_arraylike_invalid(self, other, box_with_array): + # We don't parametrize this over box_with_array because listlike + # other plays poorly with assert_invalid_comparison reversed checks + + rng = timedelta_range("1 days", periods=10)._data + rng = tm.box_expected(rng, box_with_array) + assert_invalid_comparison(rng, other, box_with_array) + + def test_td64arr_cmp_mixed_invalid(self): + rng = timedelta_range("1 days", periods=5)._data + other = np.array([0, 1, 2, rng[3], Timestamp("2021-01-01")]) + + result = rng == other + expected = np.array([False, False, False, True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = rng != other + tm.assert_numpy_array_equal(result, ~expected) + + msg = "Invalid comparison between|Cannot compare type|not supported between" + with pytest.raises(TypeError, match=msg): + rng < other + with pytest.raises(TypeError, match=msg): + rng > other + with pytest.raises(TypeError, match=msg): + rng <= other + with pytest.raises(TypeError, match=msg): + rng >= other + + +class TestTimedelta64ArrayComparisons: + # TODO: All of these need to be parametrized over box + + @pytest.mark.parametrize("dtype", [None, object]) + def test_comp_nat(self, dtype): + left = TimedeltaIndex([Timedelta("1 days"), NaT, Timedelta("3 days")]) + right = TimedeltaIndex([NaT, NaT, Timedelta("3 days")]) + + lhs, rhs = left, right + if dtype is object: + lhs, rhs = left.astype(object), right.astype(object) + + result = rhs == lhs + expected = np.array([False, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = rhs != lhs + expected = np.array([True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(lhs == NaT, expected) + tm.assert_numpy_array_equal(NaT == rhs, expected) + + expected = np.array([True, True, True]) + tm.assert_numpy_array_equal(lhs != NaT, expected) + tm.assert_numpy_array_equal(NaT != lhs, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(lhs < NaT, expected) + tm.assert_numpy_array_equal(NaT > lhs, expected) + + @pytest.mark.parametrize( + "idx2", + [ + TimedeltaIndex( + ["2 day", "2 day", NaT, NaT, "1 day 00:00:02", "5 days 00:00:03"] + ), + np.array( + [ + np.timedelta64(2, "D"), + np.timedelta64(2, "D"), + np.timedelta64("nat"), + np.timedelta64("nat"), + np.timedelta64(1, "D") + np.timedelta64(2, "s"), + np.timedelta64(5, "D") + np.timedelta64(3, "s"), + ] + ), + ], + ) + def test_comparisons_nat(self, idx2): + idx1 = TimedeltaIndex( + [ + "1 day", + NaT, + "1 day 00:00:01", + NaT, + "1 day 00:00:01", + "5 day 00:00:03", + ] + ) + # Check pd.NaT is handles as the same as np.nan + result = idx1 < idx2 + expected = np.array([True, False, False, False, True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = idx2 > idx1 + expected = np.array([True, False, False, False, True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = idx1 <= idx2 + expected = np.array([True, False, False, False, True, True]) + tm.assert_numpy_array_equal(result, expected) + + result = idx2 >= idx1 + expected = np.array([True, False, False, False, True, True]) + tm.assert_numpy_array_equal(result, expected) + + result = idx1 == idx2 + expected = np.array([False, False, False, False, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = idx1 != idx2 + expected = np.array([True, True, True, True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + # TODO: better name + def test_comparisons_coverage(self): + rng = timedelta_range("1 days", periods=10) + + result = rng < rng[3] + expected = np.array([True, True, True] + [False] * 7) + tm.assert_numpy_array_equal(result, expected) + + result = rng == list(rng) + exp = rng == rng + tm.assert_numpy_array_equal(result, exp) + + +# ------------------------------------------------------------------ +# Timedelta64[ns] dtype Arithmetic Operations + + +class TestTimedelta64ArithmeticUnsorted: + # Tests moved from type-specific test files but not + # yet sorted/parametrized/de-duplicated + + def test_ufunc_coercions(self): + # normal ops are also tested in tseries/test_timedeltas.py + idx = TimedeltaIndex(["2h", "4h", "6h", "8h", "10h"], freq="2h", name="x") + + for result in [idx * 2, np.multiply(idx, 2)]: + assert isinstance(result, TimedeltaIndex) + exp = TimedeltaIndex(["4h", "8h", "12h", "16h", "20h"], freq="4h", name="x") + tm.assert_index_equal(result, exp) + assert result.freq == "4h" + + for result in [idx / 2, np.divide(idx, 2)]: + assert isinstance(result, TimedeltaIndex) + exp = TimedeltaIndex(["1h", "2h", "3h", "4h", "5h"], freq="h", name="x") + tm.assert_index_equal(result, exp) + assert result.freq == "h" + + for result in [-idx, np.negative(idx)]: + assert isinstance(result, TimedeltaIndex) + exp = TimedeltaIndex( + ["-2h", "-4h", "-6h", "-8h", "-10h"], freq="-2h", name="x" + ) + tm.assert_index_equal(result, exp) + assert result.freq == "-2h" + + idx = TimedeltaIndex(["-2h", "-1h", "0h", "1h", "2h"], freq="h", name="x") + for result in [abs(idx), np.absolute(idx)]: + assert isinstance(result, TimedeltaIndex) + exp = TimedeltaIndex(["2h", "1h", "0h", "1h", "2h"], freq=None, name="x") + tm.assert_index_equal(result, exp) + assert result.freq is None + + def test_subtraction_ops(self): + # with datetimes/timedelta and tdi/dti + tdi = TimedeltaIndex(["1 days", NaT, "2 days"], name="foo") + dti = pd.date_range("20130101", periods=3, name="bar") + td = Timedelta("1 days") + dt = Timestamp("20130101") + + msg = "cannot subtract a datelike from a TimedeltaArray" + with pytest.raises(TypeError, match=msg): + tdi - dt + with pytest.raises(TypeError, match=msg): + tdi - dti + + msg = r"unsupported operand type\(s\) for -" + with pytest.raises(TypeError, match=msg): + td - dt + + msg = "(bad|unsupported) operand type for unary" + with pytest.raises(TypeError, match=msg): + td - dti + + result = dt - dti + expected = TimedeltaIndex(["0 days", "-1 days", "-2 days"], name="bar") + tm.assert_index_equal(result, expected) + + result = dti - dt + expected = TimedeltaIndex(["0 days", "1 days", "2 days"], name="bar") + tm.assert_index_equal(result, expected) + + result = tdi - td + expected = TimedeltaIndex(["0 days", NaT, "1 days"], name="foo") + tm.assert_index_equal(result, expected) + + result = td - tdi + expected = TimedeltaIndex(["0 days", NaT, "-1 days"], name="foo") + tm.assert_index_equal(result, expected) + + result = dti - td + expected = DatetimeIndex( + ["20121231", "20130101", "20130102"], dtype="M8[ns]", freq="D", name="bar" + ) + tm.assert_index_equal(result, expected) + + result = dt - tdi + expected = DatetimeIndex( + ["20121231", NaT, "20121230"], dtype="M8[ns]", name="foo" + ) + tm.assert_index_equal(result, expected) + + def test_subtraction_ops_with_tz(self, box_with_array): + # check that dt/dti subtraction ops with tz are validated + dti = pd.date_range("20130101", periods=3) + dti = tm.box_expected(dti, box_with_array) + ts = Timestamp("20130101") + dt = ts.to_pydatetime() + dti_tz = pd.date_range("20130101", periods=3).tz_localize("US/Eastern") + dti_tz = tm.box_expected(dti_tz, box_with_array) + ts_tz = Timestamp("20130101").tz_localize("US/Eastern") + ts_tz2 = Timestamp("20130101").tz_localize("CET") + dt_tz = ts_tz.to_pydatetime() + td = Timedelta("1 days") + + def _check(result, expected): + assert result == expected + assert isinstance(result, Timedelta) + + # scalars + result = ts - ts + expected = Timedelta("0 days") + _check(result, expected) + + result = dt_tz - ts_tz + expected = Timedelta("0 days") + _check(result, expected) + + result = ts_tz - dt_tz + expected = Timedelta("0 days") + _check(result, expected) + + # tz mismatches + msg = "Cannot subtract tz-naive and tz-aware datetime-like objects." + with pytest.raises(TypeError, match=msg): + dt_tz - ts + msg = "can't subtract offset-naive and offset-aware datetimes" + with pytest.raises(TypeError, match=msg): + dt_tz - dt + msg = "can't subtract offset-naive and offset-aware datetimes" + with pytest.raises(TypeError, match=msg): + dt - dt_tz + msg = "Cannot subtract tz-naive and tz-aware datetime-like objects." + with pytest.raises(TypeError, match=msg): + ts - dt_tz + with pytest.raises(TypeError, match=msg): + ts_tz2 - ts + with pytest.raises(TypeError, match=msg): + ts_tz2 - dt + + msg = "Cannot subtract tz-naive and tz-aware" + # with dti + with pytest.raises(TypeError, match=msg): + dti - ts_tz + with pytest.raises(TypeError, match=msg): + dti_tz - ts + + result = dti_tz - dt_tz + expected = TimedeltaIndex(["0 days", "1 days", "2 days"]) + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(result, expected) + + result = dt_tz - dti_tz + expected = TimedeltaIndex(["0 days", "-1 days", "-2 days"]) + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(result, expected) + + result = dti_tz - ts_tz + expected = TimedeltaIndex(["0 days", "1 days", "2 days"]) + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(result, expected) + + result = ts_tz - dti_tz + expected = TimedeltaIndex(["0 days", "-1 days", "-2 days"]) + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(result, expected) + + result = td - td + expected = Timedelta("0 days") + _check(result, expected) + + result = dti_tz - td + expected = DatetimeIndex( + ["20121231", "20130101", "20130102"], tz="US/Eastern" + ).as_unit("ns") + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(result, expected) + + def test_dti_tdi_numeric_ops(self): + # These are normally union/diff set-like ops + tdi = TimedeltaIndex(["1 days", NaT, "2 days"], name="foo") + dti = pd.date_range("20130101", periods=3, name="bar") + + result = tdi - tdi + expected = TimedeltaIndex(["0 days", NaT, "0 days"], name="foo") + tm.assert_index_equal(result, expected) + + result = tdi + tdi + expected = TimedeltaIndex(["2 days", NaT, "4 days"], name="foo") + tm.assert_index_equal(result, expected) + + result = dti - tdi # name will be reset + expected = DatetimeIndex(["20121231", NaT, "20130101"], dtype="M8[ns]") + tm.assert_index_equal(result, expected) + + def test_addition_ops(self): + # with datetimes/timedelta and tdi/dti + tdi = TimedeltaIndex(["1 days", NaT, "2 days"], name="foo") + dti = pd.date_range("20130101", periods=3, name="bar") + td = Timedelta("1 days") + dt = Timestamp("20130101") + + result = tdi + dt + expected = DatetimeIndex( + ["20130102", NaT, "20130103"], dtype="M8[ns]", name="foo" + ) + tm.assert_index_equal(result, expected) + + result = dt + tdi + expected = DatetimeIndex( + ["20130102", NaT, "20130103"], dtype="M8[ns]", name="foo" + ) + tm.assert_index_equal(result, expected) + + result = td + tdi + expected = TimedeltaIndex(["2 days", NaT, "3 days"], name="foo") + tm.assert_index_equal(result, expected) + + result = tdi + td + expected = TimedeltaIndex(["2 days", NaT, "3 days"], name="foo") + tm.assert_index_equal(result, expected) + + # unequal length + msg = "cannot add indices of unequal length" + with pytest.raises(ValueError, match=msg): + tdi + dti[0:1] + with pytest.raises(ValueError, match=msg): + tdi[0:1] + dti + + # random indexes + msg = "Addition/subtraction of integers and integer-arrays" + with pytest.raises(TypeError, match=msg): + tdi + Index([1, 2, 3], dtype=np.int64) + + # this is a union! + # FIXME: don't leave commented-out + # pytest.raises(TypeError, lambda : Index([1,2,3]) + tdi) + + result = tdi + dti # name will be reset + expected = DatetimeIndex(["20130102", NaT, "20130105"], dtype="M8[ns]") + tm.assert_index_equal(result, expected) + + result = dti + tdi # name will be reset + expected = DatetimeIndex(["20130102", NaT, "20130105"], dtype="M8[ns]") + tm.assert_index_equal(result, expected) + + result = dt + td + expected = Timestamp("20130102") + assert result == expected + + result = td + dt + expected = Timestamp("20130102") + assert result == expected + + # TODO: Needs more informative name, probably split up into + # more targeted tests + @pytest.mark.parametrize("freq", ["D", "B"]) + def test_timedelta(self, freq): + index = pd.date_range("1/1/2000", periods=50, freq=freq) + + shifted = index + timedelta(1) + back = shifted + timedelta(-1) + back = back._with_freq("infer") + tm.assert_index_equal(index, back) + + if freq == "D": + expected = pd.tseries.offsets.Day(1) + assert index.freq == expected + assert shifted.freq == expected + assert back.freq == expected + else: # freq == 'B' + assert index.freq == pd.tseries.offsets.BusinessDay(1) + assert shifted.freq is None + assert back.freq == pd.tseries.offsets.BusinessDay(1) + + result = index - timedelta(1) + expected = index + timedelta(-1) + tm.assert_index_equal(result, expected) + + def test_timedelta_tick_arithmetic(self): + # GH#4134, buggy with timedeltas + rng = pd.date_range("2013", "2014") + s = Series(rng) + result1 = rng - offsets.Hour(1) + result2 = DatetimeIndex(s - np.timedelta64(100000000)) + result3 = rng - np.timedelta64(100000000) + result4 = DatetimeIndex(s - offsets.Hour(1)) + + assert result1.freq == rng.freq + result1 = result1._with_freq(None) + tm.assert_index_equal(result1, result4) + + assert result3.freq == rng.freq + result3 = result3._with_freq(None) + tm.assert_index_equal(result2, result3) + + def test_tda_add_sub_index(self): + # Check that TimedeltaArray defers to Index on arithmetic ops + tdi = TimedeltaIndex(["1 days", NaT, "2 days"]) + tda = tdi.array + + dti = pd.date_range("1999-12-31", periods=3, freq="D") + + result = tda + dti + expected = tdi + dti + tm.assert_index_equal(result, expected) + + result = tda + tdi + expected = tdi + tdi + tm.assert_index_equal(result, expected) + + result = tda - tdi + expected = tdi - tdi + tm.assert_index_equal(result, expected) + + def test_tda_add_dt64_object_array(self, box_with_array, tz_naive_fixture): + # Result should be cast back to DatetimeArray + box = box_with_array + + dti = pd.date_range("2016-01-01", periods=3, tz=tz_naive_fixture) + dti = dti._with_freq(None) + tdi = dti - dti + + obj = tm.box_expected(tdi, box) + other = tm.box_expected(dti, box) + + with tm.assert_produces_warning(PerformanceWarning): + result = obj + other.astype(object) + tm.assert_equal(result, other.astype(object)) + + # ------------------------------------------------------------- + # Binary operations TimedeltaIndex and timedelta-like + + def test_tdi_iadd_timedeltalike(self, two_hours, box_with_array): + # only test adding/sub offsets as + is now numeric + rng = timedelta_range("1 days", "10 days") + expected = timedelta_range("1 days 02:00:00", "10 days 02:00:00", freq="D") + + rng = tm.box_expected(rng, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + orig_rng = rng + rng += two_hours + tm.assert_equal(rng, expected) + if box_with_array is not Index: + # Check that operation is actually inplace + tm.assert_equal(orig_rng, expected) + + def test_tdi_isub_timedeltalike(self, two_hours, box_with_array): + # only test adding/sub offsets as - is now numeric + rng = timedelta_range("1 days", "10 days") + expected = timedelta_range("0 days 22:00:00", "9 days 22:00:00") + + rng = tm.box_expected(rng, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + orig_rng = rng + rng -= two_hours + tm.assert_equal(rng, expected) + if box_with_array is not Index: + # Check that operation is actually inplace + tm.assert_equal(orig_rng, expected) + + # ------------------------------------------------------------- + + def test_tdi_ops_attributes(self): + rng = timedelta_range("2 days", periods=5, freq="2D", name="x") + + result = rng + 1 * rng.freq + exp = timedelta_range("4 days", periods=5, freq="2D", name="x") + tm.assert_index_equal(result, exp) + assert result.freq == "2D" + + result = rng - 2 * rng.freq + exp = timedelta_range("-2 days", periods=5, freq="2D", name="x") + tm.assert_index_equal(result, exp) + assert result.freq == "2D" + + result = rng * 2 + exp = timedelta_range("4 days", periods=5, freq="4D", name="x") + tm.assert_index_equal(result, exp) + assert result.freq == "4D" + + result = rng / 2 + exp = timedelta_range("1 days", periods=5, freq="D", name="x") + tm.assert_index_equal(result, exp) + assert result.freq == "D" + + result = -rng + exp = timedelta_range("-2 days", periods=5, freq="-2D", name="x") + tm.assert_index_equal(result, exp) + assert result.freq == "-2D" + + rng = timedelta_range("-2 days", periods=5, freq="D", name="x") + + result = abs(rng) + exp = TimedeltaIndex( + ["2 days", "1 days", "0 days", "1 days", "2 days"], name="x" + ) + tm.assert_index_equal(result, exp) + assert result.freq is None + + +class TestAddSubNaTMasking: + # TODO: parametrize over boxes + + @pytest.mark.parametrize("str_ts", ["1950-01-01", "1980-01-01"]) + def test_tdarr_add_timestamp_nat_masking(self, box_with_array, str_ts): + # GH#17991 checking for overflow-masking with NaT + tdinat = pd.to_timedelta(["24658 days 11:15:00", "NaT"]) + tdobj = tm.box_expected(tdinat, box_with_array) + + ts = Timestamp(str_ts) + ts_variants = [ + ts, + ts.to_pydatetime(), + ts.to_datetime64().astype("datetime64[ns]"), + ts.to_datetime64().astype("datetime64[D]"), + ] + + for variant in ts_variants: + res = tdobj + variant + if box_with_array is DataFrame: + assert res.iloc[1, 1] is NaT + else: + assert res[1] is NaT + + def test_tdi_add_overflow(self): + # See GH#14068 + # preliminary test scalar analogue of vectorized tests below + # TODO: Make raised error message more informative and test + with pytest.raises(OutOfBoundsDatetime, match="10155196800000000000"): + pd.to_timedelta(106580, "D") + Timestamp("2000") + with pytest.raises(OutOfBoundsDatetime, match="10155196800000000000"): + Timestamp("2000") + pd.to_timedelta(106580, "D") + + _NaT = NaT._value + 1 + msg = "Overflow in int64 addition" + with pytest.raises(OverflowError, match=msg): + pd.to_timedelta([106580], "D") + Timestamp("2000") + with pytest.raises(OverflowError, match=msg): + Timestamp("2000") + pd.to_timedelta([106580], "D") + with pytest.raises(OverflowError, match=msg): + pd.to_timedelta([_NaT]) - Timedelta("1 days") + with pytest.raises(OverflowError, match=msg): + pd.to_timedelta(["5 days", _NaT]) - Timedelta("1 days") + with pytest.raises(OverflowError, match=msg): + ( + pd.to_timedelta([_NaT, "5 days", "1 hours"]) + - pd.to_timedelta(["7 seconds", _NaT, "4 hours"]) + ) + + # These should not overflow! + exp = TimedeltaIndex([NaT]) + result = pd.to_timedelta([NaT]) - Timedelta("1 days") + tm.assert_index_equal(result, exp) + + exp = TimedeltaIndex(["4 days", NaT]) + result = pd.to_timedelta(["5 days", NaT]) - Timedelta("1 days") + tm.assert_index_equal(result, exp) + + exp = TimedeltaIndex([NaT, NaT, "5 hours"]) + result = pd.to_timedelta([NaT, "5 days", "1 hours"]) + pd.to_timedelta( + ["7 seconds", NaT, "4 hours"] + ) + tm.assert_index_equal(result, exp) + + +class TestTimedeltaArraylikeAddSubOps: + # Tests for timedelta64[ns] __add__, __sub__, __radd__, __rsub__ + + def test_sub_nat_retain_unit(self): + ser = pd.to_timedelta(Series(["00:00:01"])).astype("m8[s]") + + result = ser - NaT + expected = Series([NaT], dtype="m8[s]") + tm.assert_series_equal(result, expected) + + # TODO: moved from tests.indexes.timedeltas.test_arithmetic; needs + # parametrization+de-duplication + def test_timedelta_ops_with_missing_values(self): + # setup + s1 = pd.to_timedelta(Series(["00:00:01"])) + s2 = pd.to_timedelta(Series(["00:00:02"])) + + sn = pd.to_timedelta(Series([NaT], dtype="m8[ns]")) + + df1 = DataFrame(["00:00:01"]).apply(pd.to_timedelta) + df2 = DataFrame(["00:00:02"]).apply(pd.to_timedelta) + + dfn = DataFrame([NaT._value]).apply(pd.to_timedelta) + + scalar1 = pd.to_timedelta("00:00:01") + scalar2 = pd.to_timedelta("00:00:02") + timedelta_NaT = pd.to_timedelta("NaT") + + actual = scalar1 + scalar1 + assert actual == scalar2 + actual = scalar2 - scalar1 + assert actual == scalar1 + + actual = s1 + s1 + tm.assert_series_equal(actual, s2) + actual = s2 - s1 + tm.assert_series_equal(actual, s1) + + actual = s1 + scalar1 + tm.assert_series_equal(actual, s2) + actual = scalar1 + s1 + tm.assert_series_equal(actual, s2) + actual = s2 - scalar1 + tm.assert_series_equal(actual, s1) + actual = -scalar1 + s2 + tm.assert_series_equal(actual, s1) + + actual = s1 + timedelta_NaT + tm.assert_series_equal(actual, sn) + actual = timedelta_NaT + s1 + tm.assert_series_equal(actual, sn) + actual = s1 - timedelta_NaT + tm.assert_series_equal(actual, sn) + actual = -timedelta_NaT + s1 + tm.assert_series_equal(actual, sn) + + msg = "unsupported operand type" + with pytest.raises(TypeError, match=msg): + s1 + np.nan + with pytest.raises(TypeError, match=msg): + np.nan + s1 + with pytest.raises(TypeError, match=msg): + s1 - np.nan + with pytest.raises(TypeError, match=msg): + -np.nan + s1 + + actual = s1 + NaT + tm.assert_series_equal(actual, sn) + actual = s2 - NaT + tm.assert_series_equal(actual, sn) + + actual = s1 + df1 + tm.assert_frame_equal(actual, df2) + actual = s2 - df1 + tm.assert_frame_equal(actual, df1) + actual = df1 + s1 + tm.assert_frame_equal(actual, df2) + actual = df2 - s1 + tm.assert_frame_equal(actual, df1) + + actual = df1 + df1 + tm.assert_frame_equal(actual, df2) + actual = df2 - df1 + tm.assert_frame_equal(actual, df1) + + actual = df1 + scalar1 + tm.assert_frame_equal(actual, df2) + actual = df2 - scalar1 + tm.assert_frame_equal(actual, df1) + + actual = df1 + timedelta_NaT + tm.assert_frame_equal(actual, dfn) + actual = df1 - timedelta_NaT + tm.assert_frame_equal(actual, dfn) + + msg = "cannot subtract a datelike from|unsupported operand type" + with pytest.raises(TypeError, match=msg): + df1 + np.nan + with pytest.raises(TypeError, match=msg): + df1 - np.nan + + actual = df1 + NaT # NaT is datetime, not timedelta + tm.assert_frame_equal(actual, dfn) + actual = df1 - NaT + tm.assert_frame_equal(actual, dfn) + + # TODO: moved from tests.series.test_operators, needs splitting, cleanup, + # de-duplication, box-parametrization... + def test_operators_timedelta64(self): + # series ops + v1 = pd.date_range("2012-1-1", periods=3, freq="D") + v2 = pd.date_range("2012-1-2", periods=3, freq="D") + rs = Series(v2) - Series(v1) + xp = Series(1e9 * 3600 * 24, rs.index).astype("int64").astype("timedelta64[ns]") + tm.assert_series_equal(rs, xp) + assert rs.dtype == "timedelta64[ns]" + + df = DataFrame({"A": v1}) + td = Series([timedelta(days=i) for i in range(3)]) + assert td.dtype == "timedelta64[ns]" + + # series on the rhs + result = df["A"] - df["A"].shift() + assert result.dtype == "timedelta64[ns]" + + result = df["A"] + td + assert result.dtype == "M8[ns]" + + # scalar Timestamp on rhs + maxa = df["A"].max() + assert isinstance(maxa, Timestamp) + + resultb = df["A"] - df["A"].max() + assert resultb.dtype == "timedelta64[ns]" + + # timestamp on lhs + result = resultb + df["A"] + values = [Timestamp("20111230"), Timestamp("20120101"), Timestamp("20120103")] + expected = Series(values, dtype="M8[ns]", name="A") + tm.assert_series_equal(result, expected) + + # datetimes on rhs + result = df["A"] - datetime(2001, 1, 1) + expected = Series([timedelta(days=4017 + i) for i in range(3)], name="A") + tm.assert_series_equal(result, expected) + assert result.dtype == "m8[ns]" + + d = datetime(2001, 1, 1, 3, 4) + resulta = df["A"] - d + assert resulta.dtype == "m8[ns]" + + # roundtrip + resultb = resulta + d + tm.assert_series_equal(df["A"], resultb) + + # timedeltas on rhs + td = timedelta(days=1) + resulta = df["A"] + td + resultb = resulta - td + tm.assert_series_equal(resultb, df["A"]) + assert resultb.dtype == "M8[ns]" + + # roundtrip + td = timedelta(minutes=5, seconds=3) + resulta = df["A"] + td + resultb = resulta - td + tm.assert_series_equal(df["A"], resultb) + assert resultb.dtype == "M8[ns]" + + # inplace + value = rs[2] + np.timedelta64(timedelta(minutes=5, seconds=1)) + rs[2] += np.timedelta64(timedelta(minutes=5, seconds=1)) + assert rs[2] == value + + def test_timedelta64_ops_nat(self): + # GH 11349 + timedelta_series = Series([NaT, Timedelta("1s")]) + nat_series_dtype_timedelta = Series([NaT, NaT], dtype="timedelta64[ns]") + single_nat_dtype_timedelta = Series([NaT], dtype="timedelta64[ns]") + + # subtraction + tm.assert_series_equal(timedelta_series - NaT, nat_series_dtype_timedelta) + tm.assert_series_equal(-NaT + timedelta_series, nat_series_dtype_timedelta) + + tm.assert_series_equal( + timedelta_series - single_nat_dtype_timedelta, nat_series_dtype_timedelta + ) + tm.assert_series_equal( + -single_nat_dtype_timedelta + timedelta_series, nat_series_dtype_timedelta + ) + + # addition + tm.assert_series_equal( + nat_series_dtype_timedelta + NaT, nat_series_dtype_timedelta + ) + tm.assert_series_equal( + NaT + nat_series_dtype_timedelta, nat_series_dtype_timedelta + ) + + tm.assert_series_equal( + nat_series_dtype_timedelta + single_nat_dtype_timedelta, + nat_series_dtype_timedelta, + ) + tm.assert_series_equal( + single_nat_dtype_timedelta + nat_series_dtype_timedelta, + nat_series_dtype_timedelta, + ) + + tm.assert_series_equal(timedelta_series + NaT, nat_series_dtype_timedelta) + tm.assert_series_equal(NaT + timedelta_series, nat_series_dtype_timedelta) + + tm.assert_series_equal( + timedelta_series + single_nat_dtype_timedelta, nat_series_dtype_timedelta + ) + tm.assert_series_equal( + single_nat_dtype_timedelta + timedelta_series, nat_series_dtype_timedelta + ) + + tm.assert_series_equal( + nat_series_dtype_timedelta + NaT, nat_series_dtype_timedelta + ) + tm.assert_series_equal( + NaT + nat_series_dtype_timedelta, nat_series_dtype_timedelta + ) + + tm.assert_series_equal( + nat_series_dtype_timedelta + single_nat_dtype_timedelta, + nat_series_dtype_timedelta, + ) + tm.assert_series_equal( + single_nat_dtype_timedelta + nat_series_dtype_timedelta, + nat_series_dtype_timedelta, + ) + + # multiplication + tm.assert_series_equal( + nat_series_dtype_timedelta * 1.0, nat_series_dtype_timedelta + ) + tm.assert_series_equal( + 1.0 * nat_series_dtype_timedelta, nat_series_dtype_timedelta + ) + + tm.assert_series_equal(timedelta_series * 1, timedelta_series) + tm.assert_series_equal(1 * timedelta_series, timedelta_series) + + tm.assert_series_equal(timedelta_series * 1.5, Series([NaT, Timedelta("1.5s")])) + tm.assert_series_equal(1.5 * timedelta_series, Series([NaT, Timedelta("1.5s")])) + + tm.assert_series_equal(timedelta_series * np.nan, nat_series_dtype_timedelta) + tm.assert_series_equal(np.nan * timedelta_series, nat_series_dtype_timedelta) + + # division + tm.assert_series_equal(timedelta_series / 2, Series([NaT, Timedelta("0.5s")])) + tm.assert_series_equal(timedelta_series / 2.0, Series([NaT, Timedelta("0.5s")])) + tm.assert_series_equal(timedelta_series / np.nan, nat_series_dtype_timedelta) + + # ------------------------------------------------------------- + # Binary operations td64 arraylike and datetime-like + + @pytest.mark.parametrize("cls", [Timestamp, datetime, np.datetime64]) + def test_td64arr_add_sub_datetimelike_scalar( + self, cls, box_with_array, tz_naive_fixture + ): + # GH#11925, GH#29558, GH#23215 + tz = tz_naive_fixture + + dt_scalar = Timestamp("2012-01-01", tz=tz) + if cls is datetime: + ts = dt_scalar.to_pydatetime() + elif cls is np.datetime64: + if tz_naive_fixture is not None: + pytest.skip(f"{cls} doesn support {tz_naive_fixture}") + ts = dt_scalar.to_datetime64() + else: + ts = dt_scalar + + tdi = timedelta_range("1 day", periods=3) + expected = pd.date_range("2012-01-02", periods=3, tz=tz) + + tdarr = tm.box_expected(tdi, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + tm.assert_equal(ts + tdarr, expected) + tm.assert_equal(tdarr + ts, expected) + + expected2 = pd.date_range("2011-12-31", periods=3, freq="-1D", tz=tz) + expected2 = tm.box_expected(expected2, box_with_array) + + tm.assert_equal(ts - tdarr, expected2) + tm.assert_equal(ts + (-tdarr), expected2) + + msg = "cannot subtract a datelike" + with pytest.raises(TypeError, match=msg): + tdarr - ts + + def test_td64arr_add_datetime64_nat(self, box_with_array): + # GH#23215 + other = np.datetime64("NaT") + + tdi = timedelta_range("1 day", periods=3) + expected = DatetimeIndex(["NaT", "NaT", "NaT"], dtype="M8[ns]") + + tdser = tm.box_expected(tdi, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + tm.assert_equal(tdser + other, expected) + tm.assert_equal(other + tdser, expected) + + def test_td64arr_sub_dt64_array(self, box_with_array): + dti = pd.date_range("2016-01-01", periods=3) + tdi = TimedeltaIndex(["-1 Day"] * 3) + dtarr = dti.values + expected = DatetimeIndex(dtarr) - tdi + + tdi = tm.box_expected(tdi, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + msg = "cannot subtract a datelike from" + with pytest.raises(TypeError, match=msg): + tdi - dtarr + + # TimedeltaIndex.__rsub__ + result = dtarr - tdi + tm.assert_equal(result, expected) + + def test_td64arr_add_dt64_array(self, box_with_array): + dti = pd.date_range("2016-01-01", periods=3) + tdi = TimedeltaIndex(["-1 Day"] * 3) + dtarr = dti.values + expected = DatetimeIndex(dtarr) + tdi + + tdi = tm.box_expected(tdi, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = tdi + dtarr + tm.assert_equal(result, expected) + result = dtarr + tdi + tm.assert_equal(result, expected) + + # ------------------------------------------------------------------ + # Invalid __add__/__sub__ operations + + @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "h"]) + @pytest.mark.parametrize("tdi_freq", [None, "h"]) + def test_td64arr_sub_periodlike( + self, box_with_array, box_with_array2, tdi_freq, pi_freq + ): + # GH#20049 subtracting PeriodIndex should raise TypeError + tdi = TimedeltaIndex(["1 hours", "2 hours"], freq=tdi_freq) + dti = Timestamp("2018-03-07 17:16:40") + tdi + pi = dti.to_period(pi_freq) + per = pi[0] + + tdi = tm.box_expected(tdi, box_with_array) + pi = tm.box_expected(pi, box_with_array2) + msg = "cannot subtract|unsupported operand type" + with pytest.raises(TypeError, match=msg): + tdi - pi + + # GH#13078 subtraction of Period scalar not supported + with pytest.raises(TypeError, match=msg): + tdi - per + + @pytest.mark.parametrize( + "other", + [ + # GH#12624 for str case + "a", + # GH#19123 + 1, + 1.5, + np.array(2), + ], + ) + def test_td64arr_addsub_numeric_scalar_invalid(self, box_with_array, other): + # vector-like others are tested in test_td64arr_add_sub_numeric_arr_invalid + tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") + tdarr = tm.box_expected(tdser, box_with_array) + + assert_invalid_addsub_type(tdarr, other) + + @pytest.mark.parametrize( + "vec", + [ + np.array([1, 2, 3]), + Index([1, 2, 3]), + Series([1, 2, 3]), + DataFrame([[1, 2, 3]]), + ], + ids=lambda x: type(x).__name__, + ) + def test_td64arr_addsub_numeric_arr_invalid( + self, box_with_array, vec, any_real_numpy_dtype + ): + tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") + tdarr = tm.box_expected(tdser, box_with_array) + + vector = vec.astype(any_real_numpy_dtype) + assert_invalid_addsub_type(tdarr, vector) + + def test_td64arr_add_sub_int(self, box_with_array, one): + # Variants of `one` for #19012, deprecated GH#22535 + rng = timedelta_range("1 days 09:00:00", freq="h", periods=10) + tdarr = tm.box_expected(rng, box_with_array) + + msg = "Addition/subtraction of integers" + assert_invalid_addsub_type(tdarr, one, msg) + + # TODO: get inplace ops into assert_invalid_addsub_type + with pytest.raises(TypeError, match=msg): + tdarr += one + with pytest.raises(TypeError, match=msg): + tdarr -= one + + def test_td64arr_add_sub_integer_array(self, box_with_array): + # GH#19959, deprecated GH#22535 + # GH#22696 for DataFrame case, check that we don't dispatch to numpy + # implementation, which treats int64 as m8[ns] + box = box_with_array + xbox = np.ndarray if box is pd.array else box + + rng = timedelta_range("1 days 09:00:00", freq="h", periods=3) + tdarr = tm.box_expected(rng, box) + other = tm.box_expected([4, 3, 2], xbox) + + msg = "Addition/subtraction of integers and integer-arrays" + assert_invalid_addsub_type(tdarr, other, msg) + + def test_td64arr_addsub_integer_array_no_freq(self, box_with_array): + # GH#19959 + box = box_with_array + xbox = np.ndarray if box is pd.array else box + + tdi = TimedeltaIndex(["1 Day", "NaT", "3 Hours"]) + tdarr = tm.box_expected(tdi, box) + other = tm.box_expected([14, -1, 16], xbox) + + msg = "Addition/subtraction of integers" + assert_invalid_addsub_type(tdarr, other, msg) + + # ------------------------------------------------------------------ + # Operations with timedelta-like others + + def test_td64arr_add_sub_td64_array(self, box_with_array): + box = box_with_array + dti = pd.date_range("2016-01-01", periods=3) + tdi = dti - dti.shift(1) + tdarr = tdi.values + + expected = 2 * tdi + tdi = tm.box_expected(tdi, box) + expected = tm.box_expected(expected, box) + + result = tdi + tdarr + tm.assert_equal(result, expected) + result = tdarr + tdi + tm.assert_equal(result, expected) + + expected_sub = 0 * tdi + result = tdi - tdarr + tm.assert_equal(result, expected_sub) + result = tdarr - tdi + tm.assert_equal(result, expected_sub) + + def test_td64arr_add_sub_tdi(self, box_with_array, names): + # GH#17250 make sure result dtype is correct + # GH#19043 make sure names are propagated correctly + box = box_with_array + exname = get_expected_name(box, names) + + tdi = TimedeltaIndex(["0 days", "1 day"], name=names[1]) + tdi = np.array(tdi) if box in [tm.to_array, pd.array] else tdi + ser = Series([Timedelta(hours=3), Timedelta(hours=4)], name=names[0]) + expected = Series([Timedelta(hours=3), Timedelta(days=1, hours=4)], name=exname) + + ser = tm.box_expected(ser, box) + expected = tm.box_expected(expected, box) + + result = tdi + ser + tm.assert_equal(result, expected) + assert_dtype(result, "timedelta64[ns]") + + result = ser + tdi + tm.assert_equal(result, expected) + assert_dtype(result, "timedelta64[ns]") + + expected = Series( + [Timedelta(hours=-3), Timedelta(days=1, hours=-4)], name=exname + ) + expected = tm.box_expected(expected, box) + + result = tdi - ser + tm.assert_equal(result, expected) + assert_dtype(result, "timedelta64[ns]") + + result = ser - tdi + tm.assert_equal(result, -expected) + assert_dtype(result, "timedelta64[ns]") + + @pytest.mark.parametrize("tdnat", [np.timedelta64("NaT"), NaT]) + def test_td64arr_add_sub_td64_nat(self, box_with_array, tdnat): + # GH#18808, GH#23320 special handling for timedelta64("NaT") + box = box_with_array + tdi = TimedeltaIndex([NaT, Timedelta("1s")]) + expected = TimedeltaIndex(["NaT"] * 2) + + obj = tm.box_expected(tdi, box) + expected = tm.box_expected(expected, box) + + result = obj + tdnat + tm.assert_equal(result, expected) + result = tdnat + obj + tm.assert_equal(result, expected) + result = obj - tdnat + tm.assert_equal(result, expected) + result = tdnat - obj + tm.assert_equal(result, expected) + + def test_td64arr_add_timedeltalike(self, two_hours, box_with_array): + # only test adding/sub offsets as + is now numeric + # GH#10699 for Tick cases + box = box_with_array + rng = timedelta_range("1 days", "10 days") + expected = timedelta_range("1 days 02:00:00", "10 days 02:00:00", freq="D") + rng = tm.box_expected(rng, box) + expected = tm.box_expected(expected, box) + + result = rng + two_hours + tm.assert_equal(result, expected) + + result = two_hours + rng + tm.assert_equal(result, expected) + + def test_td64arr_sub_timedeltalike(self, two_hours, box_with_array): + # only test adding/sub offsets as - is now numeric + # GH#10699 for Tick cases + box = box_with_array + rng = timedelta_range("1 days", "10 days") + expected = timedelta_range("0 days 22:00:00", "9 days 22:00:00") + + rng = tm.box_expected(rng, box) + expected = tm.box_expected(expected, box) + + result = rng - two_hours + tm.assert_equal(result, expected) + + result = two_hours - rng + tm.assert_equal(result, -expected) + + # ------------------------------------------------------------------ + # __add__/__sub__ with DateOffsets and arrays of DateOffsets + + def test_td64arr_add_sub_offset_index(self, names, box_with_array): + # GH#18849, GH#19744 + box = box_with_array + exname = get_expected_name(box, names) + + tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"], name=names[0]) + other = Index([offsets.Hour(n=1), offsets.Minute(n=-2)], name=names[1]) + other = np.array(other) if box in [tm.to_array, pd.array] else other + + expected = TimedeltaIndex( + [tdi[n] + other[n] for n in range(len(tdi))], freq="infer", name=exname + ) + expected_sub = TimedeltaIndex( + [tdi[n] - other[n] for n in range(len(tdi))], freq="infer", name=exname + ) + + tdi = tm.box_expected(tdi, box) + expected = tm.box_expected(expected, box).astype(object, copy=False) + expected_sub = tm.box_expected(expected_sub, box).astype(object, copy=False) + + with tm.assert_produces_warning(PerformanceWarning): + res = tdi + other + tm.assert_equal(res, expected) + + with tm.assert_produces_warning(PerformanceWarning): + res2 = other + tdi + tm.assert_equal(res2, expected) + + with tm.assert_produces_warning(PerformanceWarning): + res_sub = tdi - other + tm.assert_equal(res_sub, expected_sub) + + def test_td64arr_add_sub_offset_array(self, box_with_array): + # GH#18849, GH#18824 + box = box_with_array + tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"]) + other = np.array([offsets.Hour(n=1), offsets.Minute(n=-2)]) + + expected = TimedeltaIndex( + [tdi[n] + other[n] for n in range(len(tdi))], freq="infer" + ) + expected_sub = TimedeltaIndex( + [tdi[n] - other[n] for n in range(len(tdi))], freq="infer" + ) + + tdi = tm.box_expected(tdi, box) + expected = tm.box_expected(expected, box).astype(object) + + with tm.assert_produces_warning(PerformanceWarning): + res = tdi + other + tm.assert_equal(res, expected) + + with tm.assert_produces_warning(PerformanceWarning): + res2 = other + tdi + tm.assert_equal(res2, expected) + + expected_sub = tm.box_expected(expected_sub, box_with_array).astype(object) + with tm.assert_produces_warning(PerformanceWarning): + res_sub = tdi - other + tm.assert_equal(res_sub, expected_sub) + + def test_td64arr_with_offset_series(self, names, box_with_array): + # GH#18849 + box = box_with_array + box2 = Series if box in [Index, tm.to_array, pd.array] else box + exname = get_expected_name(box, names) + + tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"], name=names[0]) + other = Series([offsets.Hour(n=1), offsets.Minute(n=-2)], name=names[1]) + + expected_add = Series( + [tdi[n] + other[n] for n in range(len(tdi))], name=exname, dtype=object + ) + obj = tm.box_expected(tdi, box) + expected_add = tm.box_expected(expected_add, box2).astype(object) + + with tm.assert_produces_warning(PerformanceWarning): + res = obj + other + tm.assert_equal(res, expected_add) + + with tm.assert_produces_warning(PerformanceWarning): + res2 = other + obj + tm.assert_equal(res2, expected_add) + + expected_sub = Series( + [tdi[n] - other[n] for n in range(len(tdi))], name=exname, dtype=object + ) + expected_sub = tm.box_expected(expected_sub, box2).astype(object) + + with tm.assert_produces_warning(PerformanceWarning): + res3 = obj - other + tm.assert_equal(res3, expected_sub) + + @pytest.mark.parametrize("obox", [np.array, Index, Series]) + def test_td64arr_addsub_anchored_offset_arraylike(self, obox, box_with_array): + # GH#18824 + tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"]) + tdi = tm.box_expected(tdi, box_with_array) + + anchored = obox([offsets.MonthEnd(), offsets.Day(n=2)]) + + # addition/subtraction ops with anchored offsets should issue + # a PerformanceWarning and _then_ raise a TypeError. + msg = "has incorrect type|cannot add the type MonthEnd" + with pytest.raises(TypeError, match=msg): + with tm.assert_produces_warning(PerformanceWarning): + tdi + anchored + with pytest.raises(TypeError, match=msg): + with tm.assert_produces_warning(PerformanceWarning): + anchored + tdi + with pytest.raises(TypeError, match=msg): + with tm.assert_produces_warning(PerformanceWarning): + tdi - anchored + with pytest.raises(TypeError, match=msg): + with tm.assert_produces_warning(PerformanceWarning): + anchored - tdi + + # ------------------------------------------------------------------ + # Unsorted + + def test_td64arr_add_sub_object_array(self, box_with_array): + box = box_with_array + xbox = np.ndarray if box is pd.array else box + + tdi = timedelta_range("1 day", periods=3, freq="D") + tdarr = tm.box_expected(tdi, box) + + other = np.array([Timedelta(days=1), offsets.Day(2), Timestamp("2000-01-04")]) + + with tm.assert_produces_warning(PerformanceWarning): + result = tdarr + other + + expected = Index( + [Timedelta(days=2), Timedelta(days=4), Timestamp("2000-01-07")] + ) + expected = tm.box_expected(expected, xbox).astype(object) + tm.assert_equal(result, expected) + + msg = "unsupported operand type|cannot subtract a datelike" + with pytest.raises(TypeError, match=msg): + with tm.assert_produces_warning(PerformanceWarning): + tdarr - other + + with tm.assert_produces_warning(PerformanceWarning): + result = other - tdarr + + expected = Index([Timedelta(0), Timedelta(0), Timestamp("2000-01-01")]) + expected = tm.box_expected(expected, xbox).astype(object) + tm.assert_equal(result, expected) + + +class TestTimedeltaArraylikeMulDivOps: + # Tests for timedelta64[ns] + # __mul__, __rmul__, __div__, __rdiv__, __floordiv__, __rfloordiv__ + + # ------------------------------------------------------------------ + # Multiplication + # organized with scalar others first, then array-like + + def test_td64arr_mul_int(self, box_with_array): + idx = TimedeltaIndex(np.arange(5, dtype="int64")) + idx = tm.box_expected(idx, box_with_array) + + result = idx * 1 + tm.assert_equal(result, idx) + + result = 1 * idx + tm.assert_equal(result, idx) + + def test_td64arr_mul_tdlike_scalar_raises(self, two_hours, box_with_array): + rng = timedelta_range("1 days", "10 days", name="foo") + rng = tm.box_expected(rng, box_with_array) + msg = "|".join( + [ + "argument must be an integer", + "cannot use operands with types dtype", + "Cannot multiply with", + ] + ) + with pytest.raises(TypeError, match=msg): + rng * two_hours + + def test_tdi_mul_int_array_zerodim(self, box_with_array): + rng5 = np.arange(5, dtype="int64") + idx = TimedeltaIndex(rng5) + expected = TimedeltaIndex(rng5 * 5) + + idx = tm.box_expected(idx, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = idx * np.array(5, dtype="int64") + tm.assert_equal(result, expected) + + def test_tdi_mul_int_array(self, box_with_array): + rng5 = np.arange(5, dtype="int64") + idx = TimedeltaIndex(rng5) + expected = TimedeltaIndex(rng5**2) + + idx = tm.box_expected(idx, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = idx * rng5 + tm.assert_equal(result, expected) + + def test_tdi_mul_int_series(self, box_with_array): + box = box_with_array + xbox = Series if box in [Index, tm.to_array, pd.array] else box + + idx = TimedeltaIndex(np.arange(5, dtype="int64")) + expected = TimedeltaIndex(np.arange(5, dtype="int64") ** 2) + + idx = tm.box_expected(idx, box) + expected = tm.box_expected(expected, xbox) + + result = idx * Series(np.arange(5, dtype="int64")) + tm.assert_equal(result, expected) + + def test_tdi_mul_float_series(self, box_with_array): + box = box_with_array + xbox = Series if box in [Index, tm.to_array, pd.array] else box + + idx = TimedeltaIndex(np.arange(5, dtype="int64")) + idx = tm.box_expected(idx, box) + + rng5f = np.arange(5, dtype="float64") + expected = TimedeltaIndex(rng5f * (rng5f + 1.0)) + expected = tm.box_expected(expected, xbox) + + result = idx * Series(rng5f + 1.0) + tm.assert_equal(result, expected) + + # TODO: Put Series/DataFrame in others? + @pytest.mark.parametrize( + "other", + [ + np.arange(1, 11), + Index(np.arange(1, 11), np.int64), + Index(range(1, 11), np.uint64), + Index(range(1, 11), np.float64), + pd.RangeIndex(1, 11), + ], + ids=lambda x: type(x).__name__, + ) + def test_tdi_rmul_arraylike(self, other, box_with_array): + box = box_with_array + + tdi = TimedeltaIndex(["1 Day"] * 10) + expected = timedelta_range("1 days", "10 days")._with_freq(None) + + tdi = tm.box_expected(tdi, box) + xbox = get_upcast_box(tdi, other) + + expected = tm.box_expected(expected, xbox) + + result = other * tdi + tm.assert_equal(result, expected) + commute = tdi * other + tm.assert_equal(commute, expected) + + # ------------------------------------------------------------------ + # __div__, __rdiv__ + + def test_td64arr_div_nat_invalid(self, box_with_array): + # don't allow division by NaT (maybe could in the future) + rng = timedelta_range("1 days", "10 days", name="foo") + rng = tm.box_expected(rng, box_with_array) + + with pytest.raises(TypeError, match="unsupported operand type"): + rng / NaT + with pytest.raises(TypeError, match="Cannot divide NaTType by"): + NaT / rng + + dt64nat = np.datetime64("NaT", "ns") + msg = "|".join( + [ + # 'divide' on npdev as of 2021-12-18 + "ufunc '(true_divide|divide)' cannot use operands", + "cannot perform __r?truediv__", + "Cannot divide datetime64 by TimedeltaArray", + ] + ) + with pytest.raises(TypeError, match=msg): + rng / dt64nat + with pytest.raises(TypeError, match=msg): + dt64nat / rng + + def test_td64arr_div_td64nat(self, box_with_array): + # GH#23829 + box = box_with_array + xbox = np.ndarray if box is pd.array else box + + rng = timedelta_range("1 days", "10 days") + rng = tm.box_expected(rng, box) + + other = np.timedelta64("NaT") + + expected = np.array([np.nan] * 10) + expected = tm.box_expected(expected, xbox) + + result = rng / other + tm.assert_equal(result, expected) + + result = other / rng + tm.assert_equal(result, expected) + + def test_td64arr_div_int(self, box_with_array): + idx = TimedeltaIndex(np.arange(5, dtype="int64")) + idx = tm.box_expected(idx, box_with_array) + + result = idx / 1 + tm.assert_equal(result, idx) + + with pytest.raises(TypeError, match="Cannot divide"): + # GH#23829 + 1 / idx + + def test_td64arr_div_tdlike_scalar(self, two_hours, box_with_array): + # GH#20088, GH#22163 ensure DataFrame returns correct dtype + box = box_with_array + xbox = np.ndarray if box is pd.array else box + + rng = timedelta_range("1 days", "10 days", name="foo") + expected = Index((np.arange(10) + 1) * 12, dtype=np.float64, name="foo") + + rng = tm.box_expected(rng, box) + expected = tm.box_expected(expected, xbox) + + result = rng / two_hours + tm.assert_equal(result, expected) + + result = two_hours / rng + expected = 1 / expected + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("m", [1, 3, 10]) + @pytest.mark.parametrize("unit", ["D", "h", "m", "s", "ms", "us", "ns"]) + def test_td64arr_div_td64_scalar(self, m, unit, box_with_array): + box = box_with_array + xbox = np.ndarray if box is pd.array else box + + ser = Series([Timedelta(days=59)] * 3) + ser[2] = np.nan + flat = ser + ser = tm.box_expected(ser, box) + + # op + expected = Series([x / np.timedelta64(m, unit) for x in flat]) + expected = tm.box_expected(expected, xbox) + result = ser / np.timedelta64(m, unit) + tm.assert_equal(result, expected) + + # reverse op + expected = Series([Timedelta(np.timedelta64(m, unit)) / x for x in flat]) + expected = tm.box_expected(expected, xbox) + result = np.timedelta64(m, unit) / ser + tm.assert_equal(result, expected) + + def test_td64arr_div_tdlike_scalar_with_nat(self, two_hours, box_with_array): + box = box_with_array + xbox = np.ndarray if box is pd.array else box + + rng = TimedeltaIndex(["1 days", NaT, "2 days"], name="foo") + expected = Index([12, np.nan, 24], dtype=np.float64, name="foo") + + rng = tm.box_expected(rng, box) + expected = tm.box_expected(expected, xbox) + + result = rng / two_hours + tm.assert_equal(result, expected) + + result = two_hours / rng + expected = 1 / expected + tm.assert_equal(result, expected) + + def test_td64arr_div_td64_ndarray(self, box_with_array): + # GH#22631 + box = box_with_array + xbox = np.ndarray if box is pd.array else box + + rng = TimedeltaIndex(["1 days", NaT, "2 days"]) + expected = Index([12, np.nan, 24], dtype=np.float64) + + rng = tm.box_expected(rng, box) + expected = tm.box_expected(expected, xbox) + + other = np.array([2, 4, 2], dtype="m8[h]") + result = rng / other + tm.assert_equal(result, expected) + + result = rng / tm.box_expected(other, box) + tm.assert_equal(result, expected) + + result = rng / other.astype(object) + tm.assert_equal(result, expected.astype(object)) + + result = rng / list(other) + tm.assert_equal(result, expected) + + # reversed op + expected = 1 / expected + result = other / rng + tm.assert_equal(result, expected) + + result = tm.box_expected(other, box) / rng + tm.assert_equal(result, expected) + + result = other.astype(object) / rng + tm.assert_equal(result, expected) + + result = list(other) / rng + tm.assert_equal(result, expected) + + def test_tdarr_div_length_mismatch(self, box_with_array): + rng = TimedeltaIndex(["1 days", NaT, "2 days"]) + mismatched = [1, 2, 3, 4] + + rng = tm.box_expected(rng, box_with_array) + msg = "Cannot divide vectors|Unable to coerce to Series" + for obj in [mismatched, mismatched[:2]]: + # one shorter, one longer + for other in [obj, np.array(obj), Index(obj)]: + with pytest.raises(ValueError, match=msg): + rng / other + with pytest.raises(ValueError, match=msg): + other / rng + + def test_td64_div_object_mixed_result(self, box_with_array): + # Case where we having a NaT in the result inseat of timedelta64("NaT") + # is misleading + orig = timedelta_range("1 Day", periods=3).insert(1, NaT) + tdi = tm.box_expected(orig, box_with_array, transpose=False) + + other = np.array([orig[0], 1.5, 2.0, orig[2]], dtype=object) + other = tm.box_expected(other, box_with_array, transpose=False) + + res = tdi / other + + expected = Index([1.0, np.timedelta64("NaT", "ns"), orig[0], 1.5], dtype=object) + expected = tm.box_expected(expected, box_with_array, transpose=False) + if isinstance(expected, NumpyExtensionArray): + expected = expected.to_numpy() + tm.assert_equal(res, expected) + if box_with_array is DataFrame: + # We have a np.timedelta64(NaT), not pd.NaT + assert isinstance(res.iloc[1, 0], np.timedelta64) + + res = tdi // other + + expected = Index([1, np.timedelta64("NaT", "ns"), orig[0], 1], dtype=object) + expected = tm.box_expected(expected, box_with_array, transpose=False) + if isinstance(expected, NumpyExtensionArray): + expected = expected.to_numpy() + tm.assert_equal(res, expected) + if box_with_array is DataFrame: + # We have a np.timedelta64(NaT), not pd.NaT + assert isinstance(res.iloc[1, 0], np.timedelta64) + + # ------------------------------------------------------------------ + # __floordiv__, __rfloordiv__ + + def test_td64arr_floordiv_td64arr_with_nat( + self, box_with_array, using_array_manager + ): + # GH#35529 + box = box_with_array + xbox = np.ndarray if box is pd.array else box + + left = Series([1000, 222330, 30], dtype="timedelta64[ns]") + right = Series([1000, 222330, None], dtype="timedelta64[ns]") + + left = tm.box_expected(left, box) + right = tm.box_expected(right, box) + + expected = np.array([1.0, 1.0, np.nan], dtype=np.float64) + expected = tm.box_expected(expected, xbox) + if box is DataFrame and using_array_manager: + # INFO(ArrayManager) floordiv returns integer, and ArrayManager + # performs ops column-wise and thus preserves int64 dtype for + # columns without missing values + expected[[0, 1]] = expected[[0, 1]].astype("int64") + + with tm.maybe_produces_warning( + RuntimeWarning, box is pd.array, check_stacklevel=False + ): + result = left // right + + tm.assert_equal(result, expected) + + # case that goes through __rfloordiv__ with arraylike + with tm.maybe_produces_warning( + RuntimeWarning, box is pd.array, check_stacklevel=False + ): + result = np.asarray(left) // right + tm.assert_equal(result, expected) + + @pytest.mark.filterwarnings("ignore:invalid value encountered:RuntimeWarning") + def test_td64arr_floordiv_tdscalar(self, box_with_array, scalar_td): + # GH#18831, GH#19125 + box = box_with_array + xbox = np.ndarray if box is pd.array else box + td = Timedelta("5m3s") # i.e. (scalar_td - 1sec) / 2 + + td1 = Series([td, td, NaT], dtype="m8[ns]") + td1 = tm.box_expected(td1, box, transpose=False) + + expected = Series([0, 0, np.nan]) + expected = tm.box_expected(expected, xbox, transpose=False) + + result = td1 // scalar_td + tm.assert_equal(result, expected) + + # Reversed op + expected = Series([2, 2, np.nan]) + expected = tm.box_expected(expected, xbox, transpose=False) + + result = scalar_td // td1 + tm.assert_equal(result, expected) + + # same thing buts let's be explicit about calling __rfloordiv__ + result = td1.__rfloordiv__(scalar_td) + tm.assert_equal(result, expected) + + def test_td64arr_floordiv_int(self, box_with_array): + idx = TimedeltaIndex(np.arange(5, dtype="int64")) + idx = tm.box_expected(idx, box_with_array) + result = idx // 1 + tm.assert_equal(result, idx) + + pattern = "floor_divide cannot use operands|Cannot divide int by Timedelta*" + with pytest.raises(TypeError, match=pattern): + 1 // idx + + # ------------------------------------------------------------------ + # mod, divmod + # TODO: operations with timedelta-like arrays, numeric arrays, + # reversed ops + + def test_td64arr_mod_tdscalar(self, box_with_array, three_days): + tdi = timedelta_range("1 Day", "9 days") + tdarr = tm.box_expected(tdi, box_with_array) + + expected = TimedeltaIndex(["1 Day", "2 Days", "0 Days"] * 3) + expected = tm.box_expected(expected, box_with_array) + + result = tdarr % three_days + tm.assert_equal(result, expected) + + warn = None + if box_with_array is DataFrame and isinstance(three_days, pd.DateOffset): + warn = PerformanceWarning + # TODO: making expected be object here a result of DataFrame.__divmod__ + # being defined in a naive way that does not dispatch to the underlying + # array's __divmod__ + expected = expected.astype(object) + + with tm.assert_produces_warning(warn): + result = divmod(tdarr, three_days) + + tm.assert_equal(result[1], expected) + tm.assert_equal(result[0], tdarr // three_days) + + def test_td64arr_mod_int(self, box_with_array): + tdi = timedelta_range("1 ns", "10 ns", periods=10) + tdarr = tm.box_expected(tdi, box_with_array) + + expected = TimedeltaIndex(["1 ns", "0 ns"] * 5) + expected = tm.box_expected(expected, box_with_array) + + result = tdarr % 2 + tm.assert_equal(result, expected) + + msg = "Cannot divide int by" + with pytest.raises(TypeError, match=msg): + 2 % tdarr + + result = divmod(tdarr, 2) + tm.assert_equal(result[1], expected) + tm.assert_equal(result[0], tdarr // 2) + + def test_td64arr_rmod_tdscalar(self, box_with_array, three_days): + tdi = timedelta_range("1 Day", "9 days") + tdarr = tm.box_expected(tdi, box_with_array) + + expected = ["0 Days", "1 Day", "0 Days"] + ["3 Days"] * 6 + expected = TimedeltaIndex(expected) + expected = tm.box_expected(expected, box_with_array) + + result = three_days % tdarr + tm.assert_equal(result, expected) + + result = divmod(three_days, tdarr) + tm.assert_equal(result[1], expected) + tm.assert_equal(result[0], three_days // tdarr) + + # ------------------------------------------------------------------ + # Operations with invalid others + + def test_td64arr_mul_tdscalar_invalid(self, box_with_array, scalar_td): + td1 = Series([timedelta(minutes=5, seconds=3)] * 3) + td1.iloc[2] = np.nan + + td1 = tm.box_expected(td1, box_with_array) + + # check that we are getting a TypeError + # with 'operate' (from core/ops.py) for the ops that are not + # defined + pattern = "operate|unsupported|cannot|not supported" + with pytest.raises(TypeError, match=pattern): + td1 * scalar_td + with pytest.raises(TypeError, match=pattern): + scalar_td * td1 + + def test_td64arr_mul_too_short_raises(self, box_with_array): + idx = TimedeltaIndex(np.arange(5, dtype="int64")) + idx = tm.box_expected(idx, box_with_array) + msg = "|".join( + [ + "cannot use operands with types dtype", + "Cannot multiply with unequal lengths", + "Unable to coerce to Series", + ] + ) + with pytest.raises(TypeError, match=msg): + # length check before dtype check + idx * idx[:3] + with pytest.raises(ValueError, match=msg): + idx * np.array([1, 2]) + + def test_td64arr_mul_td64arr_raises(self, box_with_array): + idx = TimedeltaIndex(np.arange(5, dtype="int64")) + idx = tm.box_expected(idx, box_with_array) + msg = "cannot use operands with types dtype" + with pytest.raises(TypeError, match=msg): + idx * idx + + # ------------------------------------------------------------------ + # Operations with numeric others + + def test_td64arr_mul_numeric_scalar(self, box_with_array, one): + # GH#4521 + # divide/multiply by integers + tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") + expected = Series(["-59 Days", "-59 Days", "NaT"], dtype="timedelta64[ns]") + + tdser = tm.box_expected(tdser, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = tdser * (-one) + tm.assert_equal(result, expected) + result = (-one) * tdser + tm.assert_equal(result, expected) + + expected = Series(["118 Days", "118 Days", "NaT"], dtype="timedelta64[ns]") + expected = tm.box_expected(expected, box_with_array) + + result = tdser * (2 * one) + tm.assert_equal(result, expected) + result = (2 * one) * tdser + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("two", [2, 2.0, np.array(2), np.array(2.0)]) + def test_td64arr_div_numeric_scalar(self, box_with_array, two): + # GH#4521 + # divide/multiply by integers + tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") + expected = Series(["29.5D", "29.5D", "NaT"], dtype="timedelta64[ns]") + + tdser = tm.box_expected(tdser, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = tdser / two + tm.assert_equal(result, expected) + + with pytest.raises(TypeError, match="Cannot divide"): + two / tdser + + @pytest.mark.parametrize("two", [2, 2.0, np.array(2), np.array(2.0)]) + def test_td64arr_floordiv_numeric_scalar(self, box_with_array, two): + tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") + expected = Series(["29.5D", "29.5D", "NaT"], dtype="timedelta64[ns]") + + tdser = tm.box_expected(tdser, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = tdser // two + tm.assert_equal(result, expected) + + with pytest.raises(TypeError, match="Cannot divide"): + two // tdser + + @pytest.mark.parametrize( + "vector", + [np.array([20, 30, 40]), Index([20, 30, 40]), Series([20, 30, 40])], + ids=lambda x: type(x).__name__, + ) + def test_td64arr_rmul_numeric_array( + self, + box_with_array, + vector, + any_real_numpy_dtype, + ): + # GH#4521 + # divide/multiply by integers + + tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") + vector = vector.astype(any_real_numpy_dtype) + + expected = Series(["1180 Days", "1770 Days", "NaT"], dtype="timedelta64[ns]") + + tdser = tm.box_expected(tdser, box_with_array) + xbox = get_upcast_box(tdser, vector) + + expected = tm.box_expected(expected, xbox) + + result = tdser * vector + tm.assert_equal(result, expected) + + result = vector * tdser + tm.assert_equal(result, expected) + + @pytest.mark.parametrize( + "vector", + [np.array([20, 30, 40]), Index([20, 30, 40]), Series([20, 30, 40])], + ids=lambda x: type(x).__name__, + ) + def test_td64arr_div_numeric_array( + self, box_with_array, vector, any_real_numpy_dtype + ): + # GH#4521 + # divide/multiply by integers + + tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") + vector = vector.astype(any_real_numpy_dtype) + + expected = Series(["2.95D", "1D 23h 12m", "NaT"], dtype="timedelta64[ns]") + + tdser = tm.box_expected(tdser, box_with_array) + xbox = get_upcast_box(tdser, vector) + expected = tm.box_expected(expected, xbox) + + result = tdser / vector + tm.assert_equal(result, expected) + + pattern = "|".join( + [ + "true_divide'? cannot use operands", + "cannot perform __div__", + "cannot perform __truediv__", + "unsupported operand", + "Cannot divide", + "ufunc 'divide' cannot use operands with types", + ] + ) + with pytest.raises(TypeError, match=pattern): + vector / tdser + + result = tdser / vector.astype(object) + if box_with_array is DataFrame: + expected = [tdser.iloc[0, n] / vector[n] for n in range(len(vector))] + expected = tm.box_expected(expected, xbox).astype(object) + # We specifically expect timedelta64("NaT") here, not pd.NA + msg = "The 'downcast' keyword in fillna" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected[2] = expected[2].fillna( + np.timedelta64("NaT", "ns"), downcast=False + ) + else: + expected = [tdser[n] / vector[n] for n in range(len(tdser))] + expected = [ + x if x is not NaT else np.timedelta64("NaT", "ns") for x in expected + ] + if xbox is tm.to_array: + expected = tm.to_array(expected).astype(object) + else: + expected = xbox(expected, dtype=object) + + tm.assert_equal(result, expected) + + with pytest.raises(TypeError, match=pattern): + vector.astype(object) / tdser + + def test_td64arr_mul_int_series(self, box_with_array, names): + # GH#19042 test for correct name attachment + box = box_with_array + exname = get_expected_name(box, names) + + tdi = TimedeltaIndex( + ["0days", "1day", "2days", "3days", "4days"], name=names[0] + ) + # TODO: Should we be parametrizing over types for `ser` too? + ser = Series([0, 1, 2, 3, 4], dtype=np.int64, name=names[1]) + + expected = Series( + ["0days", "1day", "4days", "9days", "16days"], + dtype="timedelta64[ns]", + name=exname, + ) + + tdi = tm.box_expected(tdi, box) + xbox = get_upcast_box(tdi, ser) + + expected = tm.box_expected(expected, xbox) + + result = ser * tdi + tm.assert_equal(result, expected) + + result = tdi * ser + tm.assert_equal(result, expected) + + # TODO: Should we be parametrizing over types for `ser` too? + def test_float_series_rdiv_td64arr(self, box_with_array, names): + # GH#19042 test for correct name attachment + box = box_with_array + tdi = TimedeltaIndex( + ["0days", "1day", "2days", "3days", "4days"], name=names[0] + ) + ser = Series([1.5, 3, 4.5, 6, 7.5], dtype=np.float64, name=names[1]) + + xname = names[2] if box not in [tm.to_array, pd.array] else names[1] + expected = Series( + [tdi[n] / ser[n] for n in range(len(ser))], + dtype="timedelta64[ns]", + name=xname, + ) + + tdi = tm.box_expected(tdi, box) + xbox = get_upcast_box(tdi, ser) + expected = tm.box_expected(expected, xbox) + + result = ser.__rtruediv__(tdi) + if box is DataFrame: + assert result is NotImplemented + else: + tm.assert_equal(result, expected) + + def test_td64arr_all_nat_div_object_dtype_numeric(self, box_with_array): + # GH#39750 make sure we infer the result as td64 + tdi = TimedeltaIndex([NaT, NaT]) + + left = tm.box_expected(tdi, box_with_array) + right = np.array([2, 2.0], dtype=object) + + tdnat = np.timedelta64("NaT", "ns") + expected = Index([tdnat] * 2, dtype=object) + if box_with_array is not Index: + expected = tm.box_expected(expected, box_with_array).astype(object) + if box_with_array in [Series, DataFrame]: + msg = "The 'downcast' keyword in fillna is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = expected.fillna(tdnat, downcast=False) # GH#18463 + + result = left / right + tm.assert_equal(result, expected) + + result = left // right + tm.assert_equal(result, expected) + + +class TestTimedelta64ArrayLikeArithmetic: + # Arithmetic tests for timedelta64[ns] vectors fully parametrized over + # DataFrame/Series/TimedeltaIndex/TimedeltaArray. Ideally all arithmetic + # tests will eventually end up here. + + def test_td64arr_pow_invalid(self, scalar_td, box_with_array): + td1 = Series([timedelta(minutes=5, seconds=3)] * 3) + td1.iloc[2] = np.nan + + td1 = tm.box_expected(td1, box_with_array) + + # check that we are getting a TypeError + # with 'operate' (from core/ops.py) for the ops that are not + # defined + pattern = "operate|unsupported|cannot|not supported" + with pytest.raises(TypeError, match=pattern): + scalar_td**td1 + + with pytest.raises(TypeError, match=pattern): + td1**scalar_td + + +def test_add_timestamp_to_timedelta(): + # GH: 35897 + timestamp = Timestamp("2021-01-01") + result = timestamp + timedelta_range("0s", "1s", periods=31) + expected = DatetimeIndex( + [ + timestamp + + ( + pd.to_timedelta("0.033333333s") * i + + pd.to_timedelta("0.000000001s") * divmod(i, 3)[0] + ) + for i in range(31) + ] + ) + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c4231c1e6dc6cfdc8085261e93acb9dea2f77248 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/test_ndarray_backed.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/test_ndarray_backed.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..751631403c4a82b29aae26894790e538ba823275 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/test_ndarray_backed.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_arithmetic.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_arithmetic.py new file mode 100644 index 0000000000000000000000000000000000000000..0c4fcf149eb20bfa124d7387d1c6b243febdf14f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_arithmetic.py @@ -0,0 +1,139 @@ +import operator + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.fixture +def data(): + """Fixture returning boolean array with valid and missing values.""" + return pd.array( + [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False], + dtype="boolean", + ) + + +@pytest.fixture +def left_array(): + """Fixture returning boolean array with valid and missing values.""" + return pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + + +@pytest.fixture +def right_array(): + """Fixture returning boolean array with valid and missing values.""" + return pd.array([True, False, None] * 3, dtype="boolean") + + +# Basic test for the arithmetic array ops +# ----------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "opname, exp", + [ + ("add", [True, True, None, True, False, None, None, None, None]), + ("mul", [True, False, None, False, False, None, None, None, None]), + ], + ids=["add", "mul"], +) +def test_add_mul(left_array, right_array, opname, exp): + op = getattr(operator, opname) + result = op(left_array, right_array) + expected = pd.array(exp, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_sub(left_array, right_array): + msg = ( + r"numpy boolean subtract, the `-` operator, is (?:deprecated|not supported), " + r"use the bitwise_xor, the `\^` operator, or the logical_xor function instead\." + ) + with pytest.raises(TypeError, match=msg): + left_array - right_array + + +def test_div(left_array, right_array): + msg = "operator '.*' not implemented for bool dtypes" + with pytest.raises(NotImplementedError, match=msg): + # check that we are matching the non-masked Series behavior + pd.Series(left_array._data) / pd.Series(right_array._data) + + with pytest.raises(NotImplementedError, match=msg): + left_array / right_array + + +@pytest.mark.parametrize( + "opname", + [ + "floordiv", + "mod", + "pow", + ], +) +def test_op_int8(left_array, right_array, opname): + op = getattr(operator, opname) + if opname != "mod": + msg = "operator '.*' not implemented for bool dtypes" + with pytest.raises(NotImplementedError, match=msg): + result = op(left_array, right_array) + return + result = op(left_array, right_array) + expected = op(left_array.astype("Int8"), right_array.astype("Int8")) + tm.assert_extension_array_equal(result, expected) + + +# Test generic characteristics / errors +# ----------------------------------------------------------------------------- + + +def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): + # invalid ops + + if using_infer_string: + import pyarrow as pa + + err = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) + else: + err = TypeError + + op = all_arithmetic_operators + s = pd.Series(data) + ops = getattr(s, op) + + # invalid scalars + msg = ( + "did not contain a loop with signature matching types|" + "BooleanArray cannot perform the operation|" + "not supported for the input types, and the inputs could not be safely coerced " + "to any supported types according to the casting rule ''safe''" + ) + with pytest.raises(TypeError, match=msg): + ops("foo") + msg = "|".join( + [ + r"unsupported operand type\(s\) for", + "Concatenation operation is not implemented for NumPy arrays", + "has no kernel", + ] + ) + with pytest.raises(err, match=msg): + ops(pd.Timestamp("20180101")) + + # invalid array-likes + if op not in ("__mul__", "__rmul__"): + # TODO(extension) numpy's mul with object array sees booleans as numbers + msg = "|".join( + [ + r"unsupported operand type\(s\) for", + "can only concatenate str", + "not all arguments converted during string formatting", + "has no kernel", + "not implemented", + ] + ) + with pytest.raises(err, match=msg): + ops(pd.Series("foo", index=s.index)) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_astype.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_astype.py new file mode 100644 index 0000000000000000000000000000000000000000..932e903c0e448174e0b960e5ae504282cef61dc7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_astype.py @@ -0,0 +1,53 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +def test_astype(): + # with missing values + arr = pd.array([True, False, None], dtype="boolean") + + with pytest.raises(ValueError, match="cannot convert NA to integer"): + arr.astype("int64") + + with pytest.raises(ValueError, match="cannot convert float NaN to"): + arr.astype("bool") + + result = arr.astype("float64") + expected = np.array([1, 0, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + result = arr.astype("str") + expected = np.array(["True", "False", ""], dtype=f"{tm.ENDIAN}U5") + tm.assert_numpy_array_equal(result, expected) + + # no missing values + arr = pd.array([True, False, True], dtype="boolean") + result = arr.astype("int64") + expected = np.array([1, 0, 1], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = arr.astype("bool") + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + +def test_astype_to_boolean_array(): + # astype to BooleanArray + arr = pd.array([True, False, None], dtype="boolean") + + result = arr.astype("boolean") + tm.assert_extension_array_equal(result, arr) + result = arr.astype(pd.BooleanDtype()) + tm.assert_extension_array_equal(result, arr) + + +def test_astype_to_integer_array(): + # astype to IntegerArray + arr = pd.array([True, False, None], dtype="boolean") + + result = arr.astype("Int64") + expected = pd.array([1, 0, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_comparison.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_comparison.py new file mode 100644 index 0000000000000000000000000000000000000000..2eeb9da574b1e7973d98390ada40f23f57526203 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_comparison.py @@ -0,0 +1,60 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.arrays import BooleanArray +from pandas.tests.arrays.masked_shared import ComparisonOps + + +@pytest.fixture +def data(): + """Fixture returning boolean array with valid and missing data""" + return pd.array( + [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False], + dtype="boolean", + ) + + +@pytest.fixture +def dtype(): + """Fixture returning BooleanDtype""" + return pd.BooleanDtype() + + +class TestComparisonOps(ComparisonOps): + def test_compare_scalar(self, data, comparison_op): + self._compare_other(data, comparison_op, True) + + def test_compare_array(self, data, comparison_op): + other = pd.array([True] * len(data), dtype="boolean") + self._compare_other(data, comparison_op, other) + other = np.array([True] * len(data)) + self._compare_other(data, comparison_op, other) + other = pd.Series([True] * len(data)) + self._compare_other(data, comparison_op, other) + + @pytest.mark.parametrize("other", [True, False, pd.NA]) + def test_scalar(self, other, comparison_op, dtype): + ComparisonOps.test_scalar(self, other, comparison_op, dtype) + + def test_array(self, comparison_op): + op = comparison_op + a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + + result = op(a, b) + + values = op(a._data, b._data) + mask = a._mask | b._mask + expected = BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = None + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_construction.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_construction.py new file mode 100644 index 0000000000000000000000000000000000000000..645e763fbf00cec4f62474fc7d2d2be564a4e4ba --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_construction.py @@ -0,0 +1,325 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.arrays import BooleanArray +from pandas.core.arrays.boolean import coerce_to_array + + +def test_boolean_array_constructor(): + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + + result = BooleanArray(values, mask) + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + with pytest.raises(TypeError, match="values should be boolean numpy array"): + BooleanArray(values.tolist(), mask) + + with pytest.raises(TypeError, match="mask should be boolean numpy array"): + BooleanArray(values, mask.tolist()) + + with pytest.raises(TypeError, match="values should be boolean numpy array"): + BooleanArray(values.astype(int), mask) + + with pytest.raises(TypeError, match="mask should be boolean numpy array"): + BooleanArray(values, None) + + with pytest.raises(ValueError, match="values.shape must match mask.shape"): + BooleanArray(values.reshape(1, -1), mask) + + with pytest.raises(ValueError, match="values.shape must match mask.shape"): + BooleanArray(values, mask.reshape(1, -1)) + + +def test_boolean_array_constructor_copy(): + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + + result = BooleanArray(values, mask) + assert result._data is values + assert result._mask is mask + + result = BooleanArray(values, mask, copy=True) + assert result._data is not values + assert result._mask is not mask + + +def test_to_boolean_array(): + expected = BooleanArray( + np.array([True, False, True]), np.array([False, False, False]) + ) + + result = pd.array([True, False, True], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([True, False, True]), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([True, False, True], dtype=object), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + expected = BooleanArray( + np.array([True, False, True]), np.array([False, False, True]) + ) + + result = pd.array([True, False, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([True, False, None], dtype=object), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_to_boolean_array_all_none(): + expected = BooleanArray(np.array([True, True, True]), np.array([True, True, True])) + + result = pd.array([None, None, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([None, None, None], dtype=object), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "a, b", + [ + ([True, False, None, np.nan, pd.NA], [True, False, None, None, None]), + ([True, np.nan], [True, None]), + ([True, pd.NA], [True, None]), + ([np.nan, np.nan], [None, None]), + (np.array([np.nan, np.nan], dtype=float), [None, None]), + ], +) +def test_to_boolean_array_missing_indicators(a, b): + result = pd.array(a, dtype="boolean") + expected = pd.array(b, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "values", + [ + ["foo", "bar"], + ["1", "2"], + # "foo", + [1, 2], + [1.0, 2.0], + pd.date_range("20130101", periods=2), + np.array(["foo"]), + np.array([1, 2]), + np.array([1.0, 2.0]), + [np.nan, {"a": 1}], + ], +) +def test_to_boolean_array_error(values): + # error in converting existing arrays to BooleanArray + msg = "Need to pass bool-like value" + with pytest.raises(TypeError, match=msg): + pd.array(values, dtype="boolean") + + +def test_to_boolean_array_from_integer_array(): + result = pd.array(np.array([1, 0, 1, 0]), dtype="boolean") + expected = pd.array([True, False, True, False], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + result = pd.array(np.array([1, 0, 1, None]), dtype="boolean") + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_to_boolean_array_from_float_array(): + result = pd.array(np.array([1.0, 0.0, 1.0, 0.0]), dtype="boolean") + expected = pd.array([True, False, True, False], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + result = pd.array(np.array([1.0, 0.0, 1.0, np.nan]), dtype="boolean") + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_to_boolean_array_integer_like(): + # integers of 0's and 1's + result = pd.array([1, 0, 1, 0], dtype="boolean") + expected = pd.array([True, False, True, False], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + result = pd.array([1, 0, 1, None], dtype="boolean") + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_coerce_to_array(): + # TODO this is currently not public API + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + result = BooleanArray(*coerce_to_array(values, mask=mask)) + expected = BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + assert result._data is values + assert result._mask is mask + result = BooleanArray(*coerce_to_array(values, mask=mask, copy=True)) + expected = BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + assert result._data is not values + assert result._mask is not mask + + # mixed missing from values and mask + values = [True, False, None, False] + mask = np.array([False, False, False, True], dtype="bool") + result = BooleanArray(*coerce_to_array(values, mask=mask)) + expected = BooleanArray( + np.array([True, False, True, True]), np.array([False, False, True, True]) + ) + tm.assert_extension_array_equal(result, expected) + result = BooleanArray(*coerce_to_array(np.array(values, dtype=object), mask=mask)) + tm.assert_extension_array_equal(result, expected) + result = BooleanArray(*coerce_to_array(values, mask=mask.tolist())) + tm.assert_extension_array_equal(result, expected) + + # raise errors for wrong dimension + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + + # passing 2D values is OK as long as no mask + coerce_to_array(values.reshape(1, -1)) + + with pytest.raises(ValueError, match="values.shape and mask.shape must match"): + coerce_to_array(values.reshape(1, -1), mask=mask) + + with pytest.raises(ValueError, match="values.shape and mask.shape must match"): + coerce_to_array(values, mask=mask.reshape(1, -1)) + + +def test_coerce_to_array_from_boolean_array(): + # passing BooleanArray to coerce_to_array + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + arr = BooleanArray(values, mask) + result = BooleanArray(*coerce_to_array(arr)) + tm.assert_extension_array_equal(result, arr) + # no copy + assert result._data is arr._data + assert result._mask is arr._mask + + result = BooleanArray(*coerce_to_array(arr), copy=True) + tm.assert_extension_array_equal(result, arr) + assert result._data is not arr._data + assert result._mask is not arr._mask + + with pytest.raises(ValueError, match="cannot pass mask for BooleanArray input"): + coerce_to_array(arr, mask=mask) + + +def test_coerce_to_numpy_array(): + # with missing values -> object dtype + arr = pd.array([True, False, None], dtype="boolean") + result = np.array(arr) + expected = np.array([True, False, pd.NA], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + # also with no missing values -> object dtype + arr = pd.array([True, False, True], dtype="boolean") + result = np.array(arr) + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + # force bool dtype + result = np.array(arr, dtype="bool") + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + # with missing values will raise error + arr = pd.array([True, False, None], dtype="boolean") + msg = ( + "cannot convert to 'bool'-dtype NumPy array with missing values. " + "Specify an appropriate 'na_value' for this dtype." + ) + with pytest.raises(ValueError, match=msg): + np.array(arr, dtype="bool") + + +def test_to_boolean_array_from_strings(): + result = BooleanArray._from_sequence_of_strings( + np.array(["True", "False", "1", "1.0", "0", "0.0", np.nan], dtype=object), + dtype="boolean", + ) + expected = BooleanArray( + np.array([True, False, True, True, False, False, False]), + np.array([False, False, False, False, False, False, True]), + ) + + tm.assert_extension_array_equal(result, expected) + + +def test_to_boolean_array_from_strings_invalid_string(): + with pytest.raises(ValueError, match="cannot be cast"): + BooleanArray._from_sequence_of_strings(["donkey"], dtype="boolean") + + +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy(box): + con = pd.Series if box else pd.array + # default (with or without missing values) -> object dtype + arr = con([True, False, True], dtype="boolean") + result = arr.to_numpy() + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy() + expected = np.array([True, False, pd.NA], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy(dtype="str") + expected = np.array([True, False, pd.NA], dtype=f"{tm.ENDIAN}U5") + tm.assert_numpy_array_equal(result, expected) + + # no missing values -> can convert to bool, otherwise raises + arr = con([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype="bool") + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + arr = con([True, False, None], dtype="boolean") + with pytest.raises(ValueError, match="cannot convert to 'bool'-dtype"): + result = arr.to_numpy(dtype="bool") + + # specify dtype and na_value + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy(dtype=object, na_value=None) + expected = np.array([True, False, None], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype=bool, na_value=False) + expected = np.array([True, False, False], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype="int64", na_value=-99) + expected = np.array([1, 0, -99], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype="float64", na_value=np.nan) + expected = np.array([1, 0, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + # converting to int or float without specifying na_value raises + with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"): + arr.to_numpy(dtype="int64") + + +def test_to_numpy_copy(): + # to_numpy can be zero-copy if no missing values + arr = pd.array([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype=bool) + result[0] = False + tm.assert_extension_array_equal( + arr, pd.array([False, False, True], dtype="boolean") + ) + + arr = pd.array([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype=bool, copy=True) + result[0] = False + tm.assert_extension_array_equal(arr, pd.array([True, False, True], dtype="boolean")) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_function.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_function.py new file mode 100644 index 0000000000000000000000000000000000000000..2b3f3d3d16ac6c49d231ac526fa89570975e4bfb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_function.py @@ -0,0 +1,126 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize( + "ufunc", [np.add, np.logical_or, np.logical_and, np.logical_xor] +) +def test_ufuncs_binary(ufunc): + # two BooleanArrays + a = pd.array([True, False, None], dtype="boolean") + result = ufunc(a, a) + expected = pd.array(ufunc(a._data, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = ufunc(s, a) + expected = pd.Series(ufunc(a._data, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_series_equal(result, expected) + + # Boolean with numpy array + arr = np.array([True, True, False]) + result = ufunc(a, arr) + expected = pd.array(ufunc(a._data, arr), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + result = ufunc(arr, a) + expected = pd.array(ufunc(arr, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + # BooleanArray with scalar + result = ufunc(a, True) + expected = pd.array(ufunc(a._data, True), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + result = ufunc(True, a) + expected = pd.array(ufunc(True, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + # not handled types + msg = r"operand type\(s\) all returned NotImplemented from __array_ufunc__" + with pytest.raises(TypeError, match=msg): + ufunc(a, "test") + + +@pytest.mark.parametrize("ufunc", [np.logical_not]) +def test_ufuncs_unary(ufunc): + a = pd.array([True, False, None], dtype="boolean") + result = ufunc(a) + expected = pd.array(ufunc(a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + ser = pd.Series(a) + result = ufunc(ser) + expected = pd.Series(ufunc(a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_series_equal(result, expected) + + +def test_ufunc_numeric(): + # np.sqrt on np.bool_ returns float16, which we upcast to Float32 + # bc we do not have Float16 + arr = pd.array([True, False, None], dtype="boolean") + + res = np.sqrt(arr) + + expected = pd.array([1, 0, None], dtype="Float32") + tm.assert_extension_array_equal(res, expected) + + +@pytest.mark.parametrize("values", [[True, False], [True, None]]) +def test_ufunc_reduce_raises(values): + arr = pd.array(values, dtype="boolean") + + res = np.add.reduce(arr) + if arr[-1] is pd.NA: + expected = pd.NA + else: + expected = arr._data.sum() + tm.assert_almost_equal(res, expected) + + +def test_value_counts_na(): + arr = pd.array([True, False, pd.NA], dtype="boolean") + result = arr.value_counts(dropna=False) + expected = pd.Series([1, 1, 1], index=arr, dtype="Int64", name="count") + assert expected.index.dtype == arr.dtype + tm.assert_series_equal(result, expected) + + result = arr.value_counts(dropna=True) + expected = pd.Series([1, 1], index=arr[:-1], dtype="Int64", name="count") + assert expected.index.dtype == arr.dtype + tm.assert_series_equal(result, expected) + + +def test_value_counts_with_normalize(): + ser = pd.Series([True, False, pd.NA], dtype="boolean") + result = ser.value_counts(normalize=True) + expected = pd.Series([1, 1], index=ser[:-1], dtype="Float64", name="proportion") / 2 + assert expected.index.dtype == "boolean" + tm.assert_series_equal(result, expected) + + +def test_diff(): + a = pd.array( + [True, True, False, False, True, None, True, None, False], dtype="boolean" + ) + result = pd.core.algorithms.diff(a, 1) + expected = pd.array( + [None, False, True, False, True, None, None, None, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + ser = pd.Series(a) + result = ser.diff() + expected = pd.Series(expected) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_indexing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_indexing.py new file mode 100644 index 0000000000000000000000000000000000000000..6a7daea16963c99fb7c4bbcd4b122d6af53d2576 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_indexing.py @@ -0,0 +1,13 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize("na", [None, np.nan, pd.NA]) +def test_setitem_missing_values(na): + arr = pd.array([True, False, None], dtype="boolean") + expected = pd.array([True, None, None], dtype="boolean") + arr[1] = na + tm.assert_extension_array_equal(arr, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_logical.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_logical.py new file mode 100644 index 0000000000000000000000000000000000000000..66c117ea3fc66cbc5f847cc96c23e80a98329c5d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_logical.py @@ -0,0 +1,254 @@ +import operator + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.arrays import BooleanArray +from pandas.core.ops.mask_ops import ( + kleene_and, + kleene_or, + kleene_xor, +) +from pandas.tests.extension.base import BaseOpsUtil + + +class TestLogicalOps(BaseOpsUtil): + def test_numpy_scalars_ok(self, all_logical_operators): + a = pd.array([True, False, None], dtype="boolean") + op = getattr(a, all_logical_operators) + + tm.assert_extension_array_equal(op(True), op(np.bool_(True))) + tm.assert_extension_array_equal(op(False), op(np.bool_(False))) + + def get_op_from_name(self, op_name): + short_opname = op_name.strip("_") + short_opname = short_opname if "xor" in short_opname else short_opname + "_" + try: + op = getattr(operator, short_opname) + except AttributeError: + # Assume it is the reverse operator + rop = getattr(operator, short_opname[1:]) + op = lambda x, y: rop(y, x) + + return op + + def test_empty_ok(self, all_logical_operators): + a = pd.array([], dtype="boolean") + op_name = all_logical_operators + result = getattr(a, op_name)(True) + tm.assert_extension_array_equal(a, result) + + result = getattr(a, op_name)(False) + tm.assert_extension_array_equal(a, result) + + result = getattr(a, op_name)(pd.NA) + tm.assert_extension_array_equal(a, result) + + @pytest.mark.parametrize( + "other", ["a", pd.Timestamp(2017, 1, 1, 12), np.timedelta64(4)] + ) + def test_eq_mismatched_type(self, other): + # GH-44499 + arr = pd.array([True, False]) + result = arr == other + expected = pd.array([False, False]) + tm.assert_extension_array_equal(result, expected) + + result = arr != other + expected = pd.array([True, True]) + tm.assert_extension_array_equal(result, expected) + + def test_logical_length_mismatch_raises(self, all_logical_operators): + op_name = all_logical_operators + a = pd.array([True, False, None], dtype="boolean") + msg = "Lengths must match" + + with pytest.raises(ValueError, match=msg): + getattr(a, op_name)([True, False]) + + with pytest.raises(ValueError, match=msg): + getattr(a, op_name)(np.array([True, False])) + + with pytest.raises(ValueError, match=msg): + getattr(a, op_name)(pd.array([True, False], dtype="boolean")) + + def test_logical_nan_raises(self, all_logical_operators): + op_name = all_logical_operators + a = pd.array([True, False, None], dtype="boolean") + msg = "Got float instead" + + with pytest.raises(TypeError, match=msg): + getattr(a, op_name)(np.nan) + + @pytest.mark.parametrize("other", ["a", 1]) + def test_non_bool_or_na_other_raises(self, other, all_logical_operators): + a = pd.array([True, False], dtype="boolean") + with pytest.raises(TypeError, match=str(type(other).__name__)): + getattr(a, all_logical_operators)(other) + + def test_kleene_or(self): + # A clear test of behavior. + a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + result = a | b + expected = pd.array( + [True, True, True, True, False, None, True, None, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + result = b | a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) + + @pytest.mark.parametrize( + "other, expected", + [ + (pd.NA, [True, None, None]), + (True, [True, True, True]), + (np.bool_(True), [True, True, True]), + (False, [True, False, None]), + (np.bool_(False), [True, False, None]), + ], + ) + def test_kleene_or_scalar(self, other, expected): + # TODO: test True & False + a = pd.array([True, False, None], dtype="boolean") + result = a | other + expected = pd.array(expected, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = other | a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True, False, None], dtype="boolean") + ) + + def test_kleene_and(self): + # A clear test of behavior. + a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + result = a & b + expected = pd.array( + [True, False, None, False, False, False, None, False, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + result = b & a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) + + @pytest.mark.parametrize( + "other, expected", + [ + (pd.NA, [None, False, None]), + (True, [True, False, None]), + (False, [False, False, False]), + (np.bool_(True), [True, False, None]), + (np.bool_(False), [False, False, False]), + ], + ) + def test_kleene_and_scalar(self, other, expected): + a = pd.array([True, False, None], dtype="boolean") + result = a & other + expected = pd.array(expected, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = other & a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True, False, None], dtype="boolean") + ) + + def test_kleene_xor(self): + a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + result = a ^ b + expected = pd.array( + [False, True, None, True, False, None, None, None, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + result = b ^ a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) + + @pytest.mark.parametrize( + "other, expected", + [ + (pd.NA, [None, None, None]), + (True, [False, True, None]), + (np.bool_(True), [False, True, None]), + (np.bool_(False), [True, False, None]), + ], + ) + def test_kleene_xor_scalar(self, other, expected): + a = pd.array([True, False, None], dtype="boolean") + result = a ^ other + expected = pd.array(expected, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = other ^ a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True, False, None], dtype="boolean") + ) + + @pytest.mark.parametrize("other", [True, False, pd.NA, [True, False, None] * 3]) + def test_no_masked_assumptions(self, other, all_logical_operators): + # The logical operations should not assume that masked values are False! + a = pd.arrays.BooleanArray( + np.array([True, True, True, False, False, False, True, False, True]), + np.array([False] * 6 + [True, True, True]), + ) + b = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + if isinstance(other, list): + other = pd.array(other, dtype="boolean") + + result = getattr(a, all_logical_operators)(other) + expected = getattr(b, all_logical_operators)(other) + tm.assert_extension_array_equal(result, expected) + + if isinstance(other, BooleanArray): + other._data[other._mask] = True + a._data[a._mask] = False + + result = getattr(a, all_logical_operators)(other) + expected = getattr(b, all_logical_operators)(other) + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize("operation", [kleene_or, kleene_xor, kleene_and]) +def test_error_both_scalar(operation): + msg = r"Either `left` or `right` need to be a np\.ndarray." + with pytest.raises(TypeError, match=msg): + # masks need to be non-None, otherwise it ends up in an infinite recursion + operation(True, True, np.zeros(1), np.zeros(1)) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_ops.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..95ebe8528c2e5fec1a580b00bd79e0617fe7609f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_ops.py @@ -0,0 +1,27 @@ +import pandas as pd +import pandas._testing as tm + + +class TestUnaryOps: + def test_invert(self): + a = pd.array([True, False, None], dtype="boolean") + expected = pd.array([False, True, None], dtype="boolean") + tm.assert_extension_array_equal(~a, expected) + + expected = pd.Series(expected, index=["a", "b", "c"], name="name") + result = ~pd.Series(a, index=["a", "b", "c"], name="name") + tm.assert_series_equal(result, expected) + + df = pd.DataFrame({"A": a, "B": [True, False, False]}, index=["a", "b", "c"]) + result = ~df + expected = pd.DataFrame( + {"A": expected, "B": [False, True, True]}, index=["a", "b", "c"] + ) + tm.assert_frame_equal(result, expected) + + def test_abs(self): + # matching numpy behavior, abs is the identity function + arr = pd.array([True, False, None], dtype="boolean") + result = abs(arr) + + tm.assert_extension_array_equal(result, arr) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_reduction.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_reduction.py new file mode 100644 index 0000000000000000000000000000000000000000..dd8c3eda9ed05b6844c90024631a1e90f755069e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_reduction.py @@ -0,0 +1,62 @@ +import numpy as np +import pytest + +import pandas as pd + + +@pytest.fixture +def data(): + """Fixture returning boolean array, with valid and missing values.""" + return pd.array( + [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False], + dtype="boolean", + ) + + +@pytest.mark.parametrize( + "values, exp_any, exp_all, exp_any_noskip, exp_all_noskip", + [ + ([True, pd.NA], True, True, True, pd.NA), + ([False, pd.NA], False, False, pd.NA, False), + ([pd.NA], False, True, pd.NA, pd.NA), + ([], False, True, False, True), + # GH-33253: all True / all False values buggy with skipna=False + ([True, True], True, True, True, True), + ([False, False], False, False, False, False), + ], +) +def test_any_all(values, exp_any, exp_all, exp_any_noskip, exp_all_noskip): + # the methods return numpy scalars + exp_any = pd.NA if exp_any is pd.NA else np.bool_(exp_any) + exp_all = pd.NA if exp_all is pd.NA else np.bool_(exp_all) + exp_any_noskip = pd.NA if exp_any_noskip is pd.NA else np.bool_(exp_any_noskip) + exp_all_noskip = pd.NA if exp_all_noskip is pd.NA else np.bool_(exp_all_noskip) + + for con in [pd.array, pd.Series]: + a = con(values, dtype="boolean") + assert a.any() is exp_any + assert a.all() is exp_all + assert a.any(skipna=False) is exp_any_noskip + assert a.all(skipna=False) is exp_all_noskip + + assert np.any(a.any()) is exp_any + assert np.all(a.all()) is exp_all + + +@pytest.mark.parametrize("dropna", [True, False]) +def test_reductions_return_types(dropna, data, all_numeric_reductions): + op = all_numeric_reductions + s = pd.Series(data) + if dropna: + s = s.dropna() + + if op in ("sum", "prod"): + assert isinstance(getattr(s, op)(), np.int_) + elif op == "count": + # Oddly on the 32 bit build (but not Windows), this is intc (!= intp) + assert isinstance(getattr(s, op)(), np.integer) + elif op in ("min", "max"): + assert isinstance(getattr(s, op)(), np.bool_) + else: + # "mean", "std", "var", "median", "kurt", "skew" + assert isinstance(getattr(s, op)(), np.float64) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_repr.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_repr.py new file mode 100644 index 0000000000000000000000000000000000000000..0ee904b18cc9ec6197ed3ad009fae1da593c5219 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/test_repr.py @@ -0,0 +1,13 @@ +import pandas as pd + + +def test_repr(): + df = pd.DataFrame({"A": pd.array([True, False, None], dtype="boolean")}) + expected = " A\n0 True\n1 False\n2 " + assert repr(df) == expected + + expected = "0 True\n1 False\n2 \nName: A, dtype: boolean" + assert repr(df.A) == expected + + expected = "\n[True, False, ]\nLength: 3, dtype: boolean" + assert repr(df.A.array) == expected diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c4edd97290980d92472c2ccd906f42d3d45ea5d9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_algos.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_algos.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..97fb744223cb56aaf8da0bd39489837606bd1d06 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_algos.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_analytics.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_analytics.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1de86bb39386e05a4658a908909f0ab5fc0dd603 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_analytics.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_api.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d17cf0134c26053c956c76b08adf64e9e53e959d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_api.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_astype.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_astype.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9dd77317e24ecf81ce621b5fe75c0b9b21d7a1c7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_astype.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_constructors.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_constructors.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aa5b67e7045be279b1c6532c6e6437e590421137 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_constructors.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_dtypes.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_dtypes.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d6f2185d02b6a6c85a7bf562234b33efda69062c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_dtypes.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_indexing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_indexing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..69ee838b9516ea2c7bce89ad1895dc167ed2026a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_indexing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_map.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_map.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..067938fad711579930f82d3aa35312e0704a42d0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_map.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_missing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_missing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b4fc277ab2e7cc16614cdbcc9a37d2ad1921255 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_missing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_operators.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_operators.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e9c12d672d5f83be9dcec6abc01c73e0613ce9c6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_operators.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_replace.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_replace.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f3280965969d3a74ec2448847e4953e4bbe67811 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_replace.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_repr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_repr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22570de34fdde7600e558e7aee876d80183a416e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_repr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_sorting.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_sorting.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25cd9028a68aaee54d981b258ca11861b6778970 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_sorting.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_subclass.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_subclass.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d0abb55c94872ac2fbb3fedaee41cd722efb8d07 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_subclass.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_take.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_take.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a61387454b1b815c83dab7a65eb44839902abc0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_take.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_warnings.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_warnings.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58d8534341e6f58b7b0598fd2fdedaff34956fe3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__pycache__/test_warnings.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/datetimes/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/datetimes/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc3b525886852deffc2840b9ebda8cf181bda744 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/datetimes/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/datetimes/__pycache__/test_constructors.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/datetimes/__pycache__/test_constructors.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a4b0351b4a279fead096206f6debfb98af99e6ea Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/datetimes/__pycache__/test_constructors.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/datetimes/__pycache__/test_cumulative.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/datetimes/__pycache__/test_cumulative.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..92a143cac3e203431e8e580dc97e59fe2afa4f2a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/datetimes/__pycache__/test_cumulative.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/datetimes/__pycache__/test_reductions.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/datetimes/__pycache__/test_reductions.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..db3d4b9dc329dab858c394883f64a41e24e3082d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/datetimes/__pycache__/test_reductions.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93651965f96e4b5d78ccd916caab32652e8cb844 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/conftest.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/conftest.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0dcd1c9a30ceb804faa748c1cb7b6a75f3e3a61f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/conftest.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_arithmetic.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_arithmetic.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d1959758898caafa10b1360c997da2a17fc96906 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_arithmetic.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_astype.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_astype.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3632b7a69147ac51ee8b842b64c12796b9bdf489 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_astype.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_comparison.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_comparison.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7483503ab705d17c46450dab868eb15753f47b8e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_comparison.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_concat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_concat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b17832fbafeab79e28e60b64de9aeda6357673b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_concat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_construction.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_construction.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84ed046b0a8cac12f85c1073d0c3f79ac6e57e60 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_construction.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_contains.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_contains.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e1aaa2c38c84dbd029d4e71e99be898f4e25bb2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_contains.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_function.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_function.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..455c85df231390019d34e9de1380f1d6cc89cd9b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_function.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_repr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_repr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc0595438c1229ceffef7abf1833c54c17052362 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_repr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_to_numpy.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_to_numpy.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb363a011b3267645cf707aeda1a98d4c840c783 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__pycache__/test_to_numpy.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9666e901c2bb3882363fb59981d31f8ca4ca14b1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/conftest.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/conftest.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a0d4f04662d9bdc9e0971886df1cfdf0d596b237 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/conftest.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_arithmetic.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_arithmetic.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eabad5eaf1b1d174a7cff99d1d195e049e1dd3ee Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_arithmetic.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_comparison.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_comparison.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..efce5554e73ab913fbed64486240342cbe73165d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_comparison.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_concat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_concat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f9b02ad220384bb05f4394685be48a5eebf70dd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_concat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_construction.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_construction.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1181bdbcc658e9bb6b6fea47afb4b2703b93707 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_construction.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_dtypes.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_dtypes.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0734e42ca37c653ab9147b7fdd30e8c0304244d0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_dtypes.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_function.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_function.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b26d23e97fe2298d4785c6f25964acc5ba2005a6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_function.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_indexing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_indexing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..13e5d0ab807501f5935100fdd8e5fb1a409ca6f2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_indexing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_reduction.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_reduction.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da07015e813012854d393d0264ca744d7e4e29d1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_reduction.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_repr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_repr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b1cf4eabb77c546f4bd40cd19e0607f7b232c252 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__pycache__/test_repr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a9aa4c2e1fecf4b780605b18ed483869476d400 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/__pycache__/test_astype.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/__pycache__/test_astype.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd137ecb1c0e39b9d9fa1f78edb1b9883caefd01 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/__pycache__/test_astype.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/__pycache__/test_formats.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/__pycache__/test_formats.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..582aaab43839232450e358039f256da5c3fdce6f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/__pycache__/test_formats.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/__pycache__/test_interval.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/__pycache__/test_interval.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d657800634c964024e3aa74f5a5cd1b0d8a60e6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/__pycache__/test_interval.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/__pycache__/test_interval_pyarrow.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/__pycache__/test_interval_pyarrow.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd3f6c375b9b13d4c68431e7aca11aaaedd5bf70 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/__pycache__/test_interval_pyarrow.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/__pycache__/test_overlaps.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/__pycache__/test_overlaps.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c48e66f55ff9d4caf7c764e7bd0a63a77f837137 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/__pycache__/test_overlaps.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0ee08f4c335b44ccad2fa440168b0399a21ed76e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/__pycache__/test_arithmetic.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/__pycache__/test_arithmetic.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..43fde9682dca77ef16e5927612850a53d62f1f20 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/__pycache__/test_arithmetic.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/__pycache__/test_arrow_compat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/__pycache__/test_arrow_compat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..44d151fd2c29892b39ced4a9c1b3ba6083a42a64 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/__pycache__/test_arrow_compat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/__pycache__/test_function.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/__pycache__/test_function.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54743d2a010a432daf2c4c6a554b2317ebccb58b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/__pycache__/test_function.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/__pycache__/test_indexing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/__pycache__/test_indexing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b713ed8d1bd9d99aa89fd43a545966d974e53a77 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/__pycache__/test_indexing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/numpy_/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/numpy_/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..43da6944a81cc2f735156ba29a7377d3f39f9fa2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/numpy_/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/numpy_/__pycache__/test_indexing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/numpy_/__pycache__/test_indexing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..013b33d55591ac6ecb3f8b367b24e3009ec68e54 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/numpy_/__pycache__/test_indexing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/numpy_/__pycache__/test_numpy.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/numpy_/__pycache__/test_numpy.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd841b428d3a73ca8e5eaab4e2f314f6c94ace66 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/numpy_/__pycache__/test_numpy.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..03c2f101cdc57baa2311f0ee94a5c41b176b5353 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/__pycache__/test_arrow_compat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/__pycache__/test_arrow_compat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f1bee4dd8a4d35f3bd690061e3f8bc63113d1963 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/__pycache__/test_arrow_compat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/__pycache__/test_astype.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/__pycache__/test_astype.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..880ae28c741005cefc46833fde35e3ea2fa89f51 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/__pycache__/test_astype.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/__pycache__/test_constructors.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/__pycache__/test_constructors.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5786f393e97b72904407bcf974582aad8fd76c66 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/__pycache__/test_constructors.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/__pycache__/test_reductions.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/__pycache__/test_reductions.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e37bf8b0f6daa9556309e701b1c1fde4ca92b0b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/__pycache__/test_reductions.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/test_arrow_compat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/test_arrow_compat.py new file mode 100644 index 0000000000000000000000000000000000000000..431309aca0df21dbe885ae015b10c3c21f0134a2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/test_arrow_compat.py @@ -0,0 +1,130 @@ +import pytest + +from pandas.compat.pyarrow import pa_version_under10p1 + +from pandas.core.dtypes.dtypes import PeriodDtype + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import ( + PeriodArray, + period_array, +) + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + + +pa = pytest.importorskip("pyarrow") + + +def test_arrow_extension_type(): + from pandas.core.arrays.arrow.extension_types import ArrowPeriodType + + p1 = ArrowPeriodType("D") + p2 = ArrowPeriodType("D") + p3 = ArrowPeriodType("M") + + assert p1.freq == "D" + assert p1 == p2 + assert p1 != p3 + assert hash(p1) == hash(p2) + assert hash(p1) != hash(p3) + + +@pytest.mark.xfail(not pa_version_under10p1, reason="Wrong behavior with pyarrow 10") +@pytest.mark.parametrize( + "data, freq", + [ + (pd.date_range("2017", periods=3), "D"), + (pd.date_range("2017", periods=3, freq="YE"), "Y-DEC"), + ], +) +def test_arrow_array(data, freq): + from pandas.core.arrays.arrow.extension_types import ArrowPeriodType + + periods = period_array(data, freq=freq) + result = pa.array(periods) + assert isinstance(result.type, ArrowPeriodType) + assert result.type.freq == freq + expected = pa.array(periods.asi8, type="int64") + assert result.storage.equals(expected) + + # convert to its storage type + result = pa.array(periods, type=pa.int64()) + assert result.equals(expected) + + # unsupported conversions + msg = "Not supported to convert PeriodArray to 'double' type" + with pytest.raises(TypeError, match=msg): + pa.array(periods, type="float64") + + with pytest.raises(TypeError, match="different 'freq'"): + pa.array(periods, type=ArrowPeriodType("T")) + + +def test_arrow_array_missing(): + from pandas.core.arrays.arrow.extension_types import ArrowPeriodType + + arr = PeriodArray([1, 2, 3], dtype="period[D]") + arr[1] = pd.NaT + + result = pa.array(arr) + assert isinstance(result.type, ArrowPeriodType) + assert result.type.freq == "D" + expected = pa.array([1, None, 3], type="int64") + assert result.storage.equals(expected) + + +def test_arrow_table_roundtrip(): + from pandas.core.arrays.arrow.extension_types import ArrowPeriodType + + arr = PeriodArray([1, 2, 3], dtype="period[D]") + arr[1] = pd.NaT + df = pd.DataFrame({"a": arr}) + + table = pa.table(df) + assert isinstance(table.field("a").type, ArrowPeriodType) + result = table.to_pandas() + assert isinstance(result["a"].dtype, PeriodDtype) + tm.assert_frame_equal(result, df) + + table2 = pa.concat_tables([table, table]) + result = table2.to_pandas() + expected = pd.concat([df, df], ignore_index=True) + tm.assert_frame_equal(result, expected) + + +def test_arrow_load_from_zero_chunks(): + # GH-41040 + + from pandas.core.arrays.arrow.extension_types import ArrowPeriodType + + arr = PeriodArray([], dtype="period[D]") + df = pd.DataFrame({"a": arr}) + + table = pa.table(df) + assert isinstance(table.field("a").type, ArrowPeriodType) + table = pa.table( + [pa.chunked_array([], type=table.column(0).type)], schema=table.schema + ) + + result = table.to_pandas() + assert isinstance(result["a"].dtype, PeriodDtype) + tm.assert_frame_equal(result, df) + + +def test_arrow_table_roundtrip_without_metadata(): + arr = PeriodArray([1, 2, 3], dtype="period[h]") + arr[1] = pd.NaT + df = pd.DataFrame({"a": arr}) + + table = pa.table(df) + # remove the metadata + table = table.replace_schema_metadata() + assert table.schema.metadata is None + + result = table.to_pandas() + assert isinstance(result["a"].dtype, PeriodDtype) + tm.assert_frame_equal(result, df) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/test_astype.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/test_astype.py new file mode 100644 index 0000000000000000000000000000000000000000..9976c3a32580da0b5b237eaa2b839b2337363f51 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/test_astype.py @@ -0,0 +1,67 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import PeriodDtype + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import period_array + + +@pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"]) +def test_astype_int(dtype): + # We choose to ignore the sign and size of integers for + # Period/Datetime/Timedelta astype + arr = period_array(["2000", "2001", None], freq="D") + + if np.dtype(dtype) != np.int64: + with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"): + arr.astype(dtype) + return + + result = arr.astype(dtype) + expected = arr._ndarray.view("i8") + tm.assert_numpy_array_equal(result, expected) + + +def test_astype_copies(): + arr = period_array(["2000", "2001", None], freq="D") + result = arr.astype(np.int64, copy=False) + + # Add the `.base`, since we now use `.asi8` which returns a view. + # We could maybe override it in PeriodArray to return ._ndarray directly. + assert result.base is arr._ndarray + + result = arr.astype(np.int64, copy=True) + assert result is not arr._ndarray + tm.assert_numpy_array_equal(result, arr._ndarray.view("i8")) + + +def test_astype_categorical(): + arr = period_array(["2000", "2001", "2001", None], freq="D") + result = arr.astype("category") + categories = pd.PeriodIndex(["2000", "2001"], freq="D") + expected = pd.Categorical.from_codes([0, 1, 1, -1], categories=categories) + tm.assert_categorical_equal(result, expected) + + +def test_astype_period(): + arr = period_array(["2000", "2001", None], freq="D") + result = arr.astype(PeriodDtype("M")) + expected = period_array(["2000", "2001", None], freq="M") + tm.assert_period_array_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) +def test_astype_datetime(dtype): + arr = period_array(["2000", "2001", None], freq="D") + # slice off the [ns] so that the regex matches. + if dtype == "timedelta64[ns]": + with pytest.raises(TypeError, match=dtype[:-4]): + arr.astype(dtype) + + else: + # GH#45038 allow period->dt64 because we allow dt64->period + result = arr.astype(dtype) + expected = pd.DatetimeIndex(["2000", "2001", pd.NaT], dtype=dtype)._data + tm.assert_datetime_array_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/test_constructors.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/test_constructors.py new file mode 100644 index 0000000000000000000000000000000000000000..d034162f1b46e11bd06204de7707c7343fd9b1b2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/test_constructors.py @@ -0,0 +1,156 @@ +import numpy as np +import pytest + +from pandas._libs.tslibs import iNaT +from pandas._libs.tslibs.offsets import MonthEnd +from pandas._libs.tslibs.period import IncompatibleFrequency + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import ( + PeriodArray, + period_array, +) + + +@pytest.mark.parametrize( + "data, freq, expected", + [ + ([pd.Period("2017", "D")], None, [17167]), + ([pd.Period("2017", "D")], "D", [17167]), + ([2017], "D", [17167]), + (["2017"], "D", [17167]), + ([pd.Period("2017", "D")], pd.tseries.offsets.Day(), [17167]), + ([pd.Period("2017", "D"), None], None, [17167, iNaT]), + (pd.Series(pd.date_range("2017", periods=3)), None, [17167, 17168, 17169]), + (pd.date_range("2017", periods=3), None, [17167, 17168, 17169]), + (pd.period_range("2017", periods=4, freq="Q"), None, [188, 189, 190, 191]), + ], +) +def test_period_array_ok(data, freq, expected): + result = period_array(data, freq=freq).asi8 + expected = np.asarray(expected, dtype=np.int64) + tm.assert_numpy_array_equal(result, expected) + + +def test_period_array_readonly_object(): + # https://github.com/pandas-dev/pandas/issues/25403 + pa = period_array([pd.Period("2019-01-01")]) + arr = np.asarray(pa, dtype="object") + arr.setflags(write=False) + + result = period_array(arr) + tm.assert_period_array_equal(result, pa) + + result = pd.Series(arr) + tm.assert_series_equal(result, pd.Series(pa)) + + result = pd.DataFrame({"A": arr}) + tm.assert_frame_equal(result, pd.DataFrame({"A": pa})) + + +def test_from_datetime64_freq_changes(): + # https://github.com/pandas-dev/pandas/issues/23438 + arr = pd.date_range("2017", periods=3, freq="D") + result = PeriodArray._from_datetime64(arr, freq="M") + expected = period_array(["2017-01-01", "2017-01-01", "2017-01-01"], freq="M") + tm.assert_period_array_equal(result, expected) + + +@pytest.mark.parametrize("freq", ["2M", MonthEnd(2)]) +def test_from_datetime64_freq_2M(freq): + arr = np.array( + ["2020-01-01T00:00:00", "2020-01-02T00:00:00"], dtype="datetime64[ns]" + ) + result = PeriodArray._from_datetime64(arr, freq) + expected = period_array(["2020-01", "2020-01"], freq=freq) + tm.assert_period_array_equal(result, expected) + + +@pytest.mark.parametrize( + "data, freq, msg", + [ + ( + [pd.Period("2017", "D"), pd.Period("2017", "Y")], + None, + "Input has different freq", + ), + ([pd.Period("2017", "D")], "Y", "Input has different freq"), + ], +) +def test_period_array_raises(data, freq, msg): + with pytest.raises(IncompatibleFrequency, match=msg): + period_array(data, freq) + + +def test_period_array_non_period_series_raies(): + ser = pd.Series([1, 2, 3]) + with pytest.raises(TypeError, match="dtype"): + PeriodArray(ser, dtype="period[D]") + + +def test_period_array_freq_mismatch(): + arr = period_array(["2000", "2001"], freq="D") + with pytest.raises(IncompatibleFrequency, match="freq"): + PeriodArray(arr, dtype="period[M]") + + dtype = pd.PeriodDtype(pd.tseries.offsets.MonthEnd()) + with pytest.raises(IncompatibleFrequency, match="freq"): + PeriodArray(arr, dtype=dtype) + + +def test_from_sequence_disallows_i8(): + arr = period_array(["2000", "2001"], freq="D") + + msg = str(arr[0].ordinal) + with pytest.raises(TypeError, match=msg): + PeriodArray._from_sequence(arr.asi8, dtype=arr.dtype) + + with pytest.raises(TypeError, match=msg): + PeriodArray._from_sequence(list(arr.asi8), dtype=arr.dtype) + + +def test_from_td64nat_sequence_raises(): + # GH#44507 + td = pd.NaT.to_numpy("m8[ns]") + + dtype = pd.period_range("2005-01-01", periods=3, freq="D").dtype + + arr = np.array([None], dtype=object) + arr[0] = td + + msg = "Value must be Period, string, integer, or datetime" + with pytest.raises(ValueError, match=msg): + PeriodArray._from_sequence(arr, dtype=dtype) + + with pytest.raises(ValueError, match=msg): + pd.PeriodIndex(arr, dtype=dtype) + with pytest.raises(ValueError, match=msg): + pd.Index(arr, dtype=dtype) + with pytest.raises(ValueError, match=msg): + pd.array(arr, dtype=dtype) + with pytest.raises(ValueError, match=msg): + pd.Series(arr, dtype=dtype) + with pytest.raises(ValueError, match=msg): + pd.DataFrame(arr, dtype=dtype) + + +def test_freq_deprecated(): + # GH#52462 + data = np.arange(5).astype(np.int64) + msg = "The 'freq' keyword in the PeriodArray constructor is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = PeriodArray(data, freq="M") + + expected = PeriodArray(data, dtype="period[M]") + tm.assert_equal(res, expected) + + +def test_period_array_from_datetime64(): + arr = np.array( + ["2020-01-01T00:00:00", "2020-02-02T00:00:00"], dtype="datetime64[ns]" + ) + result = PeriodArray._from_datetime64(arr, freq=MonthEnd(2)) + + expected = period_array(["2020-01-01", "2020-02-01"], freq=MonthEnd(2)) + tm.assert_period_array_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/test_reductions.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/test_reductions.py new file mode 100644 index 0000000000000000000000000000000000000000..2889cc786dd71583ca345ad206553907af3a13fa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/period/test_reductions.py @@ -0,0 +1,42 @@ +import pytest + +import pandas as pd +from pandas.core.arrays import period_array + + +class TestReductions: + def test_min_max(self): + arr = period_array( + [ + "2000-01-03", + "2000-01-03", + "NaT", + "2000-01-02", + "2000-01-05", + "2000-01-04", + ], + freq="D", + ) + + result = arr.min() + expected = pd.Period("2000-01-02", freq="D") + assert result == expected + + result = arr.max() + expected = pd.Period("2000-01-05", freq="D") + assert result == expected + + result = arr.min(skipna=False) + assert result is pd.NaT + + result = arr.max(skipna=False) + assert result is pd.NaT + + @pytest.mark.parametrize("skipna", [True, False]) + def test_min_max_empty(self, skipna): + arr = period_array([], freq="D") + result = arr.min(skipna=skipna) + assert result is pd.NaT + + result = arr.max(skipna=skipna) + assert result is pd.NaT diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cefcdfe35c1248c0d86cca3d2d34e75ad7fb9c8d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_accessor.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_accessor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..001ba1e6cf56efe78a2dc919000e70bacb2f7eb3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_accessor.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_arithmetics.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_arithmetics.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..018b1d0d38dac742a18dba61aa9f34d0068acf34 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_arithmetics.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_array.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_array.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..844e3b7441451ef6ddfe88738386b1e0065b6f89 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_array.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_astype.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_astype.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65af8cff6cc4ab677117019f11f05272a5388294 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_astype.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_combine_concat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_combine_concat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e98d0dffadcedc4eb8dbcd22754cecc848db5d12 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_combine_concat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_constructors.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_constructors.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..015fd79612c42414ebddb4a5b9fb15e8d5ffaad2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_constructors.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_dtype.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_dtype.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6c0b17f027d79610d07c1dcd17f836754503f08a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_dtype.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_indexing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_indexing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5f7fe075a5da70207aceea40fdcf7b2ad4f18cce Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_indexing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_libsparse.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_libsparse.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..55a45bbbbc12533c7a42c8740d1bbdce468353d4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_libsparse.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_reductions.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_reductions.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..98dd79983426a98c05532b4972249a822ebd2a0f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_reductions.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_unary.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_unary.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f504d79911cb2369d7e20e5efd226d554962b014 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__pycache__/test_unary.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/string_/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/string_/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/string_/__pycache__/test_string.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/string_/__pycache__/test_string.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27c15dd2ced00b74e127053e1d7582d3a2f4bbf7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/string_/__pycache__/test_string.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/string_/test_string.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/string_/test_string.py new file mode 100644 index 0000000000000000000000000000000000000000..320bdca60a9327bee90ffde5483e735829aaa2c8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/string_/test_string.py @@ -0,0 +1,703 @@ +""" +This module tests the functionality of StringArray and ArrowStringArray. +Tests for the str accessors are in pandas/tests/strings/test_string_array.py +""" +import operator + +import numpy as np +import pytest + +from pandas.compat.pyarrow import pa_version_under12p0 + +from pandas.core.dtypes.common import is_dtype_equal + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, +) + + +def na_val(dtype): + if dtype.storage == "pyarrow_numpy": + return np.nan + else: + return pd.NA + + +@pytest.fixture +def dtype(string_storage): + """Fixture giving StringDtype from parametrized 'string_storage'""" + return pd.StringDtype(storage=string_storage) + + +@pytest.fixture +def cls(dtype): + """Fixture giving array type from parametrized 'dtype'""" + return dtype.construct_array_type() + + +def test_repr(dtype): + df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)}) + if dtype.storage == "pyarrow_numpy": + expected = " A\n0 a\n1 NaN\n2 b" + else: + expected = " A\n0 a\n1 \n2 b" + assert repr(df) == expected + + if dtype.storage == "pyarrow_numpy": + expected = "0 a\n1 NaN\n2 b\nName: A, dtype: string" + else: + expected = "0 a\n1 \n2 b\nName: A, dtype: string" + assert repr(df.A) == expected + + if dtype.storage == "pyarrow": + arr_name = "ArrowStringArray" + expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" + elif dtype.storage == "pyarrow_numpy": + arr_name = "ArrowStringArrayNumpySemantics" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" + else: + arr_name = "StringArray" + expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" + assert repr(df.A.array) == expected + + +def test_none_to_nan(cls, dtype): + a = cls._from_sequence(["a", None, "b"], dtype=dtype) + assert a[1] is not None + assert a[1] is na_val(a.dtype) + + +def test_setitem_validates(cls, dtype): + arr = cls._from_sequence(["a", "b"], dtype=dtype) + + if cls is pd.arrays.StringArray: + msg = "Cannot set non-string value '10' into a StringArray." + else: + msg = "Scalar must be NA or str" + with pytest.raises(TypeError, match=msg): + arr[0] = 10 + + if cls is pd.arrays.StringArray: + msg = "Must provide strings." + else: + msg = "Scalar must be NA or str" + with pytest.raises(TypeError, match=msg): + arr[:] = np.array([1, 2]) + + +def test_setitem_with_scalar_string(dtype): + # is_float_dtype considers some strings, like 'd', to be floats + # which can cause issues. + arr = pd.array(["a", "c"], dtype=dtype) + arr[0] = "d" + expected = pd.array(["d", "c"], dtype=dtype) + tm.assert_extension_array_equal(arr, expected) + + +def test_setitem_with_array_with_missing(dtype): + # ensure that when setting with an array of values, we don't mutate the + # array `value` in __setitem__(self, key, value) + arr = pd.array(["a", "b", "c"], dtype=dtype) + value = np.array(["A", None]) + value_orig = value.copy() + arr[[0, 1]] = value + + expected = pd.array(["A", pd.NA, "c"], dtype=dtype) + tm.assert_extension_array_equal(arr, expected) + tm.assert_numpy_array_equal(value, value_orig) + + +def test_astype_roundtrip(dtype): + ser = pd.Series(pd.date_range("2000", periods=12)) + ser[0] = None + + casted = ser.astype(dtype) + assert is_dtype_equal(casted.dtype, dtype) + + result = casted.astype("datetime64[ns]") + tm.assert_series_equal(result, ser) + + # GH#38509 same thing for timedelta64 + ser2 = ser - ser.iloc[-1] + casted2 = ser2.astype(dtype) + assert is_dtype_equal(casted2.dtype, dtype) + + result2 = casted2.astype(ser2.dtype) + tm.assert_series_equal(result2, ser2) + + +def test_add(dtype): + a = pd.Series(["a", "b", "c", None, None], dtype=dtype) + b = pd.Series(["x", "y", None, "z", None], dtype=dtype) + + result = a + b + expected = pd.Series(["ax", "by", None, None, None], dtype=dtype) + tm.assert_series_equal(result, expected) + + result = a.add(b) + tm.assert_series_equal(result, expected) + + result = a.radd(b) + expected = pd.Series(["xa", "yb", None, None, None], dtype=dtype) + tm.assert_series_equal(result, expected) + + result = a.add(b, fill_value="-") + expected = pd.Series(["ax", "by", "c-", "-z", None], dtype=dtype) + tm.assert_series_equal(result, expected) + + +def test_add_2d(dtype, request, arrow_string_storage): + if dtype.storage in arrow_string_storage: + reason = "Failed: DID NOT RAISE " + mark = pytest.mark.xfail(raises=None, reason=reason) + request.applymarker(mark) + + a = pd.array(["a", "b", "c"], dtype=dtype) + b = np.array([["a", "b", "c"]], dtype=object) + with pytest.raises(ValueError, match="3 != 1"): + a + b + + s = pd.Series(a) + with pytest.raises(ValueError, match="3 != 1"): + s + b + + +def test_add_sequence(dtype): + a = pd.array(["a", "b", None, None], dtype=dtype) + other = ["x", None, "y", None] + + result = a + other + expected = pd.array(["ax", None, None, None], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + result = other + a + expected = pd.array(["xa", None, None, None], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + +def test_mul(dtype): + a = pd.array(["a", "b", None], dtype=dtype) + result = a * 2 + expected = pd.array(["aa", "bb", None], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + result = 2 * a + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.xfail(reason="GH-28527") +def test_add_strings(dtype): + arr = pd.array(["a", "b", "c", "d"], dtype=dtype) + df = pd.DataFrame([["t", "y", "v", "w"]], dtype=object) + assert arr.__add__(df) is NotImplemented + + result = arr + df + expected = pd.DataFrame([["at", "by", "cv", "dw"]]).astype(dtype) + tm.assert_frame_equal(result, expected) + + result = df + arr + expected = pd.DataFrame([["ta", "yb", "vc", "wd"]]).astype(dtype) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.xfail(reason="GH-28527") +def test_add_frame(dtype): + arr = pd.array(["a", "b", np.nan, np.nan], dtype=dtype) + df = pd.DataFrame([["x", np.nan, "y", np.nan]]) + + assert arr.__add__(df) is NotImplemented + + result = arr + df + expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype(dtype) + tm.assert_frame_equal(result, expected) + + result = df + arr + expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype(dtype) + tm.assert_frame_equal(result, expected) + + +def test_comparison_methods_scalar(comparison_op, dtype): + op_name = f"__{comparison_op.__name__}__" + a = pd.array(["a", None, "c"], dtype=dtype) + other = "a" + result = getattr(a, op_name)(other) + if dtype.storage == "pyarrow_numpy": + expected = np.array([getattr(item, op_name)(other) for item in a]) + if comparison_op == operator.ne: + expected[1] = True + else: + expected[1] = False + tm.assert_numpy_array_equal(result, expected.astype(np.bool_)) + else: + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) + expected = pd.array(expected, dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) + + +def test_comparison_methods_scalar_pd_na(comparison_op, dtype): + op_name = f"__{comparison_op.__name__}__" + a = pd.array(["a", None, "c"], dtype=dtype) + result = getattr(a, op_name)(pd.NA) + + if dtype.storage == "pyarrow_numpy": + if operator.ne == comparison_op: + expected = np.array([True, True, True]) + else: + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(result, expected) + else: + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = pd.array([None, None, None], dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) + tm.assert_extension_array_equal(result, expected) + + +def test_comparison_methods_scalar_not_string(comparison_op, dtype): + op_name = f"__{comparison_op.__name__}__" + + a = pd.array(["a", None, "c"], dtype=dtype) + other = 42 + + if op_name not in ["__eq__", "__ne__"]: + with pytest.raises(TypeError, match="Invalid comparison|not supported between"): + getattr(a, op_name)(other) + + return + + result = getattr(a, op_name)(other) + + if dtype.storage == "pyarrow_numpy": + expected_data = { + "__eq__": [False, False, False], + "__ne__": [True, True, True], + }[op_name] + expected = np.array(expected_data) + tm.assert_numpy_array_equal(result, expected) + else: + expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[ + op_name + ] + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = pd.array(expected_data, dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) + + +def test_comparison_methods_array(comparison_op, dtype): + op_name = f"__{comparison_op.__name__}__" + + a = pd.array(["a", None, "c"], dtype=dtype) + other = [None, None, "c"] + result = getattr(a, op_name)(other) + if dtype.storage == "pyarrow_numpy": + if operator.ne == comparison_op: + expected = np.array([True, True, False]) + else: + expected = np.array([False, False, False]) + expected[-1] = getattr(other[-1], op_name)(a[-1]) + tm.assert_numpy_array_equal(result, expected) + + result = getattr(a, op_name)(pd.NA) + if operator.ne == comparison_op: + expected = np.array([True, True, True]) + else: + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(result, expected) + + else: + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = np.full(len(a), fill_value=None, dtype="object") + expected[-1] = getattr(other[-1], op_name)(a[-1]) + expected = pd.array(expected, dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) + + result = getattr(a, op_name)(pd.NA) + expected = pd.array([None, None, None], dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) + + +def test_constructor_raises(cls): + if cls is pd.arrays.StringArray: + msg = "StringArray requires a sequence of strings or pandas.NA" + else: + msg = "Unsupported type '' for ArrowExtensionArray" + + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", "b"], dtype="S1")) + + with pytest.raises(ValueError, match=msg): + cls(np.array([])) + + if cls is pd.arrays.StringArray: + # GH#45057 np.nan and None do NOT raise, as they are considered valid NAs + # for string dtype + cls(np.array(["a", np.nan], dtype=object)) + cls(np.array(["a", None], dtype=object)) + else: + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", np.nan], dtype=object)) + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", None], dtype=object)) + + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", pd.NaT], dtype=object)) + + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", np.datetime64("NaT", "ns")], dtype=object)) + + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", np.timedelta64("NaT", "ns")], dtype=object)) + + +@pytest.mark.parametrize("na", [np.nan, np.float64("nan"), float("nan"), None, pd.NA]) +def test_constructor_nan_like(na): + expected = pd.arrays.StringArray(np.array(["a", pd.NA])) + tm.assert_extension_array_equal( + pd.arrays.StringArray(np.array(["a", na], dtype="object")), expected + ) + + +@pytest.mark.parametrize("copy", [True, False]) +def test_from_sequence_no_mutate(copy, cls, dtype): + nan_arr = np.array(["a", np.nan], dtype=object) + expected_input = nan_arr.copy() + na_arr = np.array(["a", pd.NA], dtype=object) + + result = cls._from_sequence(nan_arr, dtype=dtype, copy=copy) + + if cls in (ArrowStringArray, ArrowStringArrayNumpySemantics): + import pyarrow as pa + + expected = cls(pa.array(na_arr, type=pa.string(), from_pandas=True)) + else: + expected = cls(na_arr) + + tm.assert_extension_array_equal(result, expected) + tm.assert_numpy_array_equal(nan_arr, expected_input) + + +def test_astype_int(dtype): + arr = pd.array(["1", "2", "3"], dtype=dtype) + result = arr.astype("int64") + expected = np.array([1, 2, 3], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + arr = pd.array(["1", pd.NA, "3"], dtype=dtype) + if dtype.storage == "pyarrow_numpy": + err = ValueError + msg = "cannot convert float NaN to integer" + else: + err = TypeError + msg = ( + r"int\(\) argument must be a string, a bytes-like " + r"object or a( real)? number" + ) + with pytest.raises(err, match=msg): + arr.astype("int64") + + +def test_astype_nullable_int(dtype): + arr = pd.array(["1", pd.NA, "3"], dtype=dtype) + + result = arr.astype("Int64") + expected = pd.array([1, pd.NA, 3], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + +def test_astype_float(dtype, any_float_dtype): + # Don't compare arrays (37974) + ser = pd.Series(["1.1", pd.NA, "3.3"], dtype=dtype) + result = ser.astype(any_float_dtype) + expected = pd.Series([1.1, np.nan, 3.3], dtype=any_float_dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.xfail(reason="Not implemented StringArray.sum") +def test_reduce(skipna, dtype): + arr = pd.Series(["a", "b", "c"], dtype=dtype) + result = arr.sum(skipna=skipna) + assert result == "abc" + + +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.xfail(reason="Not implemented StringArray.sum") +def test_reduce_missing(skipna, dtype): + arr = pd.Series([None, "a", None, "b", "c", None], dtype=dtype) + result = arr.sum(skipna=skipna) + if skipna: + assert result == "abc" + else: + assert pd.isna(result) + + +@pytest.mark.parametrize("method", ["min", "max"]) +@pytest.mark.parametrize("skipna", [True, False]) +def test_min_max(method, skipna, dtype): + arr = pd.Series(["a", "b", "c", None], dtype=dtype) + result = getattr(arr, method)(skipna=skipna) + if skipna: + expected = "a" if method == "min" else "c" + assert result == expected + else: + assert result is na_val(arr.dtype) + + +@pytest.mark.parametrize("method", ["min", "max"]) +@pytest.mark.parametrize("box", [pd.Series, pd.array]) +def test_min_max_numpy(method, box, dtype, request, arrow_string_storage): + if dtype.storage in arrow_string_storage and box is pd.array: + if box is pd.array: + reason = "'<=' not supported between instances of 'str' and 'NoneType'" + else: + reason = "'ArrowStringArray' object has no attribute 'max'" + mark = pytest.mark.xfail(raises=TypeError, reason=reason) + request.applymarker(mark) + + arr = box(["a", "b", "c", None], dtype=dtype) + result = getattr(np, method)(arr) + expected = "a" if method == "min" else "c" + assert result == expected + + +def test_fillna_args(dtype, arrow_string_storage): + # GH 37987 + + arr = pd.array(["a", pd.NA], dtype=dtype) + + res = arr.fillna(value="b") + expected = pd.array(["a", "b"], dtype=dtype) + tm.assert_extension_array_equal(res, expected) + + res = arr.fillna(value=np.str_("b")) + expected = pd.array(["a", "b"], dtype=dtype) + tm.assert_extension_array_equal(res, expected) + + if dtype.storage in arrow_string_storage: + msg = "Invalid value '1' for dtype string" + else: + msg = "Cannot set non-string value '1' into a StringArray." + with pytest.raises(TypeError, match=msg): + arr.fillna(value=1) + + +def test_arrow_array(dtype): + # protocol added in 0.15.0 + pa = pytest.importorskip("pyarrow") + import pyarrow.compute as pc + + data = pd.array(["a", "b", "c"], dtype=dtype) + arr = pa.array(data) + expected = pa.array(list(data), type=pa.large_string(), from_pandas=True) + if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: + expected = pa.chunked_array(expected) + if dtype.storage == "python": + expected = pc.cast(expected, pa.string()) + assert arr.equals(expected) + + +@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") +def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): + # roundtrip possible from arrow 1.0.0 + pa = pytest.importorskip("pyarrow") + + if using_infer_string and string_storage2 != "pyarrow_numpy": + request.applymarker( + pytest.mark.xfail( + reason="infer_string takes precedence over string storage" + ) + ) + + data = pd.array(["a", "b", None], dtype=dtype) + df = pd.DataFrame({"a": data}) + table = pa.table(df) + if dtype.storage == "python": + assert table.field("a").type == "string" + else: + assert table.field("a").type == "large_string" + with pd.option_context("string_storage", string_storage2): + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.StringDtype) + expected = df.astype(f"string[{string_storage2}]") + tm.assert_frame_equal(result, expected) + # ensure the missing value is represented by NA and not np.nan or None + assert result.loc[2, "a"] is na_val(result["a"].dtype) + + +@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") +def test_arrow_load_from_zero_chunks( + dtype, string_storage2, request, using_infer_string +): + # GH-41040 + pa = pytest.importorskip("pyarrow") + + if using_infer_string and string_storage2 != "pyarrow_numpy": + request.applymarker( + pytest.mark.xfail( + reason="infer_string takes precedence over string storage" + ) + ) + + data = pd.array([], dtype=dtype) + df = pd.DataFrame({"a": data}) + table = pa.table(df) + if dtype.storage == "python": + assert table.field("a").type == "string" + else: + assert table.field("a").type == "large_string" + # Instantiate the same table with no chunks at all + table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) + with pd.option_context("string_storage", string_storage2): + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.StringDtype) + expected = df.astype(f"string[{string_storage2}]") + tm.assert_frame_equal(result, expected) + + +def test_value_counts_na(dtype): + if getattr(dtype, "storage", "") == "pyarrow": + exp_dtype = "int64[pyarrow]" + elif getattr(dtype, "storage", "") == "pyarrow_numpy": + exp_dtype = "int64" + else: + exp_dtype = "Int64" + arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) + result = arr.value_counts(dropna=False) + expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype=exp_dtype, name="count") + tm.assert_series_equal(result, expected) + + result = arr.value_counts(dropna=True) + expected = pd.Series([2, 1], index=arr[:2], dtype=exp_dtype, name="count") + tm.assert_series_equal(result, expected) + + +def test_value_counts_with_normalize(dtype): + if getattr(dtype, "storage", "") == "pyarrow": + exp_dtype = "double[pyarrow]" + elif getattr(dtype, "storage", "") == "pyarrow_numpy": + exp_dtype = np.float64 + else: + exp_dtype = "Float64" + ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) + result = ser.value_counts(normalize=True) + expected = pd.Series([2, 1], index=ser[:2], dtype=exp_dtype, name="proportion") / 3 + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "values, expected", + [ + (["a", "b", "c"], np.array([False, False, False])), + (["a", "b", None], np.array([False, False, True])), + ], +) +def test_use_inf_as_na(values, expected, dtype): + # https://github.com/pandas-dev/pandas/issues/33655 + values = pd.array(values, dtype=dtype) + msg = "use_inf_as_na option is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + with pd.option_context("mode.use_inf_as_na", True): + result = values.isna() + tm.assert_numpy_array_equal(result, expected) + + result = pd.Series(values).isna() + expected = pd.Series(expected) + tm.assert_series_equal(result, expected) + + result = pd.DataFrame(values).isna() + expected = pd.DataFrame(expected) + tm.assert_frame_equal(result, expected) + + +def test_memory_usage(dtype, arrow_string_storage): + # GH 33963 + + if dtype.storage in arrow_string_storage: + pytest.skip(f"not applicable for {dtype.storage}") + + series = pd.Series(["a", "b", "c"], dtype=dtype) + + assert 0 < series.nbytes <= series.memory_usage() < series.memory_usage(deep=True) + + +@pytest.mark.parametrize("float_dtype", [np.float16, np.float32, np.float64]) +def test_astype_from_float_dtype(float_dtype, dtype): + # https://github.com/pandas-dev/pandas/issues/36451 + ser = pd.Series([0.1], dtype=float_dtype) + result = ser.astype(dtype) + expected = pd.Series(["0.1"], dtype=dtype) + tm.assert_series_equal(result, expected) + + +def test_to_numpy_returns_pdna_default(dtype): + arr = pd.array(["a", pd.NA, "b"], dtype=dtype) + result = np.array(arr) + expected = np.array(["a", na_val(dtype), "b"], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + +def test_to_numpy_na_value(dtype, nulls_fixture): + na_value = nulls_fixture + arr = pd.array(["a", pd.NA, "b"], dtype=dtype) + result = arr.to_numpy(na_value=na_value) + expected = np.array(["a", na_value, "b"], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + +def test_isin(dtype, fixed_now_ts): + s = pd.Series(["a", "b", None], dtype=dtype) + + result = s.isin(["a", "c"]) + expected = pd.Series([True, False, False]) + tm.assert_series_equal(result, expected) + + result = s.isin(["a", pd.NA]) + expected = pd.Series([True, False, True]) + tm.assert_series_equal(result, expected) + + result = s.isin([]) + expected = pd.Series([False, False, False]) + tm.assert_series_equal(result, expected) + + result = s.isin(["a", fixed_now_ts]) + expected = pd.Series([True, False, False]) + tm.assert_series_equal(result, expected) + + +def test_setitem_scalar_with_mask_validation(dtype): + # https://github.com/pandas-dev/pandas/issues/47628 + # setting None with a boolean mask (through _putmaks) should still result + # in pd.NA values in the underlying array + ser = pd.Series(["a", "b", "c"], dtype=dtype) + mask = np.array([False, True, False]) + + ser[mask] = None + assert ser.array[1] is na_val(ser.dtype) + + # for other non-string we should also raise an error + ser = pd.Series(["a", "b", "c"], dtype=dtype) + if type(ser.array) is pd.arrays.StringArray: + msg = "Cannot set non-string value" + else: + msg = "Scalar must be NA or str" + with pytest.raises(TypeError, match=msg): + ser[mask] = 1 + + +def test_from_numpy_str(dtype): + vals = ["a", "b", "c"] + arr = np.array(vals, dtype=np.str_) + result = pd.array(arr, dtype=dtype) + expected = pd.array(vals, dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + +def test_tolist(dtype): + vals = ["a", "b", "c"] + arr = pd.array(vals, dtype=dtype) + result = arr.tolist() + expected = vals + tm.assert_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/string_/test_string_arrow.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/string_/test_string_arrow.py new file mode 100644 index 0000000000000000000000000000000000000000..d7811b6fed8838cb792cc6faf2354d2b4237d4de --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/string_/test_string_arrow.py @@ -0,0 +1,265 @@ +import pickle +import re + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays.string_ import ( + StringArray, + StringDtype, +) +from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, +) + + +def test_eq_all_na(): + pytest.importorskip("pyarrow") + a = pd.array([pd.NA, pd.NA], dtype=StringDtype("pyarrow")) + result = a == a + expected = pd.array([pd.NA, pd.NA], dtype="boolean[pyarrow]") + tm.assert_extension_array_equal(result, expected) + + +def test_config(string_storage, request, using_infer_string): + if using_infer_string and string_storage != "pyarrow_numpy": + request.applymarker(pytest.mark.xfail(reason="infer string takes precedence")) + with pd.option_context("string_storage", string_storage): + assert StringDtype().storage == string_storage + result = pd.array(["a", "b"]) + assert result.dtype.storage == string_storage + + dtype = StringDtype(string_storage) + expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype) + tm.assert_equal(result, expected) + + +def test_config_bad_storage_raises(): + msg = re.escape("Value must be one of python|pyarrow") + with pytest.raises(ValueError, match=msg): + pd.options.mode.string_storage = "foo" + + +@pytest.mark.parametrize("chunked", [True, False]) +@pytest.mark.parametrize("array", ["numpy", "pyarrow"]) +def test_constructor_not_string_type_raises(array, chunked, arrow_string_storage): + pa = pytest.importorskip("pyarrow") + + array = pa if array in arrow_string_storage else np + + arr = array.array([1, 2, 3]) + if chunked: + if array is np: + pytest.skip("chunked not applicable to numpy array") + arr = pa.chunked_array(arr) + if array is np: + msg = "Unsupported type '' for ArrowExtensionArray" + else: + msg = re.escape( + "ArrowStringArray requires a PyArrow (chunked) array of large_string type" + ) + with pytest.raises(ValueError, match=msg): + ArrowStringArray(arr) + + +@pytest.mark.parametrize("chunked", [True, False]) +def test_constructor_not_string_type_value_dictionary_raises(chunked): + pa = pytest.importorskip("pyarrow") + + arr = pa.array([1, 2, 3], pa.dictionary(pa.int32(), pa.int32())) + if chunked: + arr = pa.chunked_array(arr) + + msg = re.escape( + "ArrowStringArray requires a PyArrow (chunked) array of large_string type" + ) + with pytest.raises(ValueError, match=msg): + ArrowStringArray(arr) + + +@pytest.mark.xfail( + reason="dict conversion does not seem to be implemented for large string in arrow" +) +@pytest.mark.parametrize("chunked", [True, False]) +def test_constructor_valid_string_type_value_dictionary(chunked): + pa = pytest.importorskip("pyarrow") + + arr = pa.array(["1", "2", "3"], pa.large_string()).dictionary_encode() + if chunked: + arr = pa.chunked_array(arr) + + arr = ArrowStringArray(arr) + assert pa.types.is_string(arr._pa_array.type.value_type) + + +def test_constructor_from_list(): + # GH#27673 + pytest.importorskip("pyarrow") + result = pd.Series(["E"], dtype=StringDtype(storage="pyarrow")) + assert isinstance(result.dtype, StringDtype) + assert result.dtype.storage == "pyarrow" + + +def test_from_sequence_wrong_dtype_raises(using_infer_string): + pytest.importorskip("pyarrow") + with pd.option_context("string_storage", "python"): + ArrowStringArray._from_sequence(["a", None, "c"], dtype="string") + + with pd.option_context("string_storage", "pyarrow"): + ArrowStringArray._from_sequence(["a", None, "c"], dtype="string") + + with pytest.raises(AssertionError, match=None): + ArrowStringArray._from_sequence(["a", None, "c"], dtype="string[python]") + + ArrowStringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]") + + if not using_infer_string: + with pytest.raises(AssertionError, match=None): + with pd.option_context("string_storage", "python"): + ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + + with pd.option_context("string_storage", "pyarrow"): + ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + + if not using_infer_string: + with pytest.raises(AssertionError, match=None): + ArrowStringArray._from_sequence( + ["a", None, "c"], dtype=StringDtype("python") + ) + + ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow")) + + with pd.option_context("string_storage", "python"): + StringArray._from_sequence(["a", None, "c"], dtype="string") + + with pd.option_context("string_storage", "pyarrow"): + StringArray._from_sequence(["a", None, "c"], dtype="string") + + StringArray._from_sequence(["a", None, "c"], dtype="string[python]") + + with pytest.raises(AssertionError, match=None): + StringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]") + + if not using_infer_string: + with pd.option_context("string_storage", "python"): + StringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + + if not using_infer_string: + with pytest.raises(AssertionError, match=None): + with pd.option_context("string_storage", "pyarrow"): + StringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + + StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python")) + + with pytest.raises(AssertionError, match=None): + StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow")) + + +@td.skip_if_installed("pyarrow") +def test_pyarrow_not_installed_raises(): + msg = re.escape("pyarrow>=10.0.1 is required for PyArrow backed") + + with pytest.raises(ImportError, match=msg): + StringDtype(storage="pyarrow") + + with pytest.raises(ImportError, match=msg): + ArrowStringArray([]) + + with pytest.raises(ImportError, match=msg): + ArrowStringArrayNumpySemantics([]) + + with pytest.raises(ImportError, match=msg): + ArrowStringArray._from_sequence(["a", None, "b"]) + + +@pytest.mark.parametrize("multiple_chunks", [False, True]) +@pytest.mark.parametrize( + "key, value, expected", + [ + (-1, "XX", ["a", "b", "c", "d", "XX"]), + (1, "XX", ["a", "XX", "c", "d", "e"]), + (1, None, ["a", None, "c", "d", "e"]), + (1, pd.NA, ["a", None, "c", "d", "e"]), + ([1, 3], "XX", ["a", "XX", "c", "XX", "e"]), + ([1, 3], ["XX", "YY"], ["a", "XX", "c", "YY", "e"]), + ([1, 3], ["XX", None], ["a", "XX", "c", None, "e"]), + ([1, 3], ["XX", pd.NA], ["a", "XX", "c", None, "e"]), + ([0, -1], ["XX", "YY"], ["XX", "b", "c", "d", "YY"]), + ([-1, 0], ["XX", "YY"], ["YY", "b", "c", "d", "XX"]), + (slice(3, None), "XX", ["a", "b", "c", "XX", "XX"]), + (slice(2, 4), ["XX", "YY"], ["a", "b", "XX", "YY", "e"]), + (slice(3, 1, -1), ["XX", "YY"], ["a", "b", "YY", "XX", "e"]), + (slice(None), "XX", ["XX", "XX", "XX", "XX", "XX"]), + ([False, True, False, True, False], ["XX", "YY"], ["a", "XX", "c", "YY", "e"]), + ], +) +def test_setitem(multiple_chunks, key, value, expected): + pa = pytest.importorskip("pyarrow") + + result = pa.array(list("abcde")) + expected = pa.array(expected) + + if multiple_chunks: + result = pa.chunked_array([result[:3], result[3:]]) + expected = pa.chunked_array([expected[:3], expected[3:]]) + + result = ArrowStringArray(result) + expected = ArrowStringArray(expected) + + result[key] = value + tm.assert_equal(result, expected) + + +def test_setitem_invalid_indexer_raises(): + pa = pytest.importorskip("pyarrow") + + arr = ArrowStringArray(pa.array(list("abcde"))) + + with pytest.raises(IndexError, match=None): + arr[5] = "foo" + + with pytest.raises(IndexError, match=None): + arr[-6] = "foo" + + with pytest.raises(IndexError, match=None): + arr[[0, 5]] = "foo" + + with pytest.raises(IndexError, match=None): + arr[[0, -6]] = "foo" + + with pytest.raises(IndexError, match=None): + arr[[True, True, False]] = "foo" + + with pytest.raises(ValueError, match=None): + arr[[0, 1]] = ["foo", "bar", "baz"] + + +@pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"]) +def test_pickle_roundtrip(dtype): + # GH 42600 + pytest.importorskip("pyarrow") + expected = pd.Series(range(10), dtype=dtype) + expected_sliced = expected.head(2) + full_pickled = pickle.dumps(expected) + sliced_pickled = pickle.dumps(expected_sliced) + + assert len(full_pickled) > len(sliced_pickled) + + result = pickle.loads(full_pickled) + tm.assert_series_equal(result, expected) + + result_sliced = pickle.loads(sliced_pickled) + tm.assert_series_equal(result_sliced, expected_sliced) + + +def test_string_dtype_error_message(): + # GH#55051 + pytest.importorskip("pyarrow") + msg = "Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'." + with pytest.raises(ValueError, match=msg): + StringDtype("bla") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/timedeltas/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/timedeltas/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f31b3fd4672b539a429a0c40a2872f9702e43ac Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/timedeltas/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/timedeltas/__pycache__/test_constructors.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/timedeltas/__pycache__/test_constructors.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..154fbf861b314626107078f60f873f80359a9d12 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/timedeltas/__pycache__/test_constructors.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/timedeltas/__pycache__/test_cumulative.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/timedeltas/__pycache__/test_cumulative.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..023eb0464fd4ee3437dfe0d485966214fc2ebe17 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/timedeltas/__pycache__/test_cumulative.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/timedeltas/__pycache__/test_reductions.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/timedeltas/__pycache__/test_reductions.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dfd9bef159ae82729cbeb57efa004274986adcec Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/timedeltas/__pycache__/test_reductions.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/computation/__pycache__/test_compat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/computation/__pycache__/test_compat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..667c18236a8213395859bfa87273da5cbbb53d90 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/computation/__pycache__/test_compat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/config/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/config/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/config/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d752e12230b3484e9edd8f5106c4e1b5b8d70930 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/config/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/config/__pycache__/test_config.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/config/__pycache__/test_config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e7c346c949f34ca2d88806d6eb8836c747961af3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/config/__pycache__/test_config.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/config/__pycache__/test_localization.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/config/__pycache__/test_localization.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22012d2df464efbf5586b33240b8c86fc40d0c4d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/config/__pycache__/test_localization.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/config/test_config.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/config/test_config.py new file mode 100644 index 0000000000000000000000000000000000000000..f49ae942423992f6dbb209e8f931f091e900ba12 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/config/test_config.py @@ -0,0 +1,437 @@ +import pytest + +from pandas._config import config as cf +from pandas._config.config import OptionError + +import pandas as pd +import pandas._testing as tm + + +class TestConfig: + @pytest.fixture(autouse=True) + def clean_config(self, monkeypatch): + with monkeypatch.context() as m: + m.setattr(cf, "_global_config", {}) + m.setattr(cf, "options", cf.DictWrapper(cf._global_config)) + m.setattr(cf, "_deprecated_options", {}) + m.setattr(cf, "_registered_options", {}) + + # Our test fixture in conftest.py sets "chained_assignment" + # to "raise" only after all test methods have been setup. + # However, after this setup, there is no longer any + # "chained_assignment" option, so re-register it. + cf.register_option("chained_assignment", "raise") + yield + + def test_api(self): + # the pandas object exposes the user API + assert hasattr(pd, "get_option") + assert hasattr(pd, "set_option") + assert hasattr(pd, "reset_option") + assert hasattr(pd, "describe_option") + + def test_is_one_of_factory(self): + v = cf.is_one_of_factory([None, 12]) + + v(12) + v(None) + msg = r"Value must be one of None\|12" + with pytest.raises(ValueError, match=msg): + v(1.1) + + def test_register_option(self): + cf.register_option("a", 1, "doc") + + # can't register an already registered option + msg = "Option 'a' has already been registered" + with pytest.raises(OptionError, match=msg): + cf.register_option("a", 1, "doc") + + # can't register an already registered option + msg = "Path prefix to option 'a' is already an option" + with pytest.raises(OptionError, match=msg): + cf.register_option("a.b.c.d1", 1, "doc") + with pytest.raises(OptionError, match=msg): + cf.register_option("a.b.c.d2", 1, "doc") + + # no python keywords + msg = "for is a python keyword" + with pytest.raises(ValueError, match=msg): + cf.register_option("for", 0) + with pytest.raises(ValueError, match=msg): + cf.register_option("a.for.b", 0) + # must be valid identifier (ensure attribute access works) + msg = "oh my goddess! is not a valid identifier" + with pytest.raises(ValueError, match=msg): + cf.register_option("Oh my Goddess!", 0) + + # we can register options several levels deep + # without predefining the intermediate steps + # and we can define differently named options + # in the same namespace + cf.register_option("k.b.c.d1", 1, "doc") + cf.register_option("k.b.c.d2", 1, "doc") + + def test_describe_option(self): + cf.register_option("a", 1, "doc") + cf.register_option("b", 1, "doc2") + cf.deprecate_option("b") + + cf.register_option("c.d.e1", 1, "doc3") + cf.register_option("c.d.e2", 1, "doc4") + cf.register_option("f", 1) + cf.register_option("g.h", 1) + cf.register_option("k", 2) + cf.deprecate_option("g.h", rkey="k") + cf.register_option("l", "foo") + + # non-existent keys raise KeyError + msg = r"No such keys\(s\)" + with pytest.raises(OptionError, match=msg): + cf.describe_option("no.such.key") + + # we can get the description for any key we registered + assert "doc" in cf.describe_option("a", _print_desc=False) + assert "doc2" in cf.describe_option("b", _print_desc=False) + assert "precated" in cf.describe_option("b", _print_desc=False) + assert "doc3" in cf.describe_option("c.d.e1", _print_desc=False) + assert "doc4" in cf.describe_option("c.d.e2", _print_desc=False) + + # if no doc is specified we get a default message + # saying "description not available" + assert "available" in cf.describe_option("f", _print_desc=False) + assert "available" in cf.describe_option("g.h", _print_desc=False) + assert "precated" in cf.describe_option("g.h", _print_desc=False) + assert "k" in cf.describe_option("g.h", _print_desc=False) + + # default is reported + assert "foo" in cf.describe_option("l", _print_desc=False) + # current value is reported + assert "bar" not in cf.describe_option("l", _print_desc=False) + cf.set_option("l", "bar") + assert "bar" in cf.describe_option("l", _print_desc=False) + + def test_case_insensitive(self): + cf.register_option("KanBAN", 1, "doc") + + assert "doc" in cf.describe_option("kanbaN", _print_desc=False) + assert cf.get_option("kanBaN") == 1 + cf.set_option("KanBan", 2) + assert cf.get_option("kAnBaN") == 2 + + # gets of non-existent keys fail + msg = r"No such keys\(s\): 'no_such_option'" + with pytest.raises(OptionError, match=msg): + cf.get_option("no_such_option") + cf.deprecate_option("KanBan") + + assert cf._is_deprecated("kAnBaN") + + def test_get_option(self): + cf.register_option("a", 1, "doc") + cf.register_option("b.c", "hullo", "doc2") + cf.register_option("b.b", None, "doc2") + + # gets of existing keys succeed + assert cf.get_option("a") == 1 + assert cf.get_option("b.c") == "hullo" + assert cf.get_option("b.b") is None + + # gets of non-existent keys fail + msg = r"No such keys\(s\): 'no_such_option'" + with pytest.raises(OptionError, match=msg): + cf.get_option("no_such_option") + + def test_set_option(self): + cf.register_option("a", 1, "doc") + cf.register_option("b.c", "hullo", "doc2") + cf.register_option("b.b", None, "doc2") + + assert cf.get_option("a") == 1 + assert cf.get_option("b.c") == "hullo" + assert cf.get_option("b.b") is None + + cf.set_option("a", 2) + cf.set_option("b.c", "wurld") + cf.set_option("b.b", 1.1) + + assert cf.get_option("a") == 2 + assert cf.get_option("b.c") == "wurld" + assert cf.get_option("b.b") == 1.1 + + msg = r"No such keys\(s\): 'no.such.key'" + with pytest.raises(OptionError, match=msg): + cf.set_option("no.such.key", None) + + def test_set_option_empty_args(self): + msg = "Must provide an even number of non-keyword arguments" + with pytest.raises(ValueError, match=msg): + cf.set_option() + + def test_set_option_uneven_args(self): + msg = "Must provide an even number of non-keyword arguments" + with pytest.raises(ValueError, match=msg): + cf.set_option("a.b", 2, "b.c") + + def test_set_option_invalid_single_argument_type(self): + msg = "Must provide an even number of non-keyword arguments" + with pytest.raises(ValueError, match=msg): + cf.set_option(2) + + def test_set_option_multiple(self): + cf.register_option("a", 1, "doc") + cf.register_option("b.c", "hullo", "doc2") + cf.register_option("b.b", None, "doc2") + + assert cf.get_option("a") == 1 + assert cf.get_option("b.c") == "hullo" + assert cf.get_option("b.b") is None + + cf.set_option("a", "2", "b.c", None, "b.b", 10.0) + + assert cf.get_option("a") == "2" + assert cf.get_option("b.c") is None + assert cf.get_option("b.b") == 10.0 + + def test_validation(self): + cf.register_option("a", 1, "doc", validator=cf.is_int) + cf.register_option("d", 1, "doc", validator=cf.is_nonnegative_int) + cf.register_option("b.c", "hullo", "doc2", validator=cf.is_text) + + msg = "Value must have type ''" + with pytest.raises(ValueError, match=msg): + cf.register_option("a.b.c.d2", "NO", "doc", validator=cf.is_int) + + cf.set_option("a", 2) # int is_int + cf.set_option("b.c", "wurld") # str is_str + cf.set_option("d", 2) + cf.set_option("d", None) # non-negative int can be None + + # None not is_int + with pytest.raises(ValueError, match=msg): + cf.set_option("a", None) + with pytest.raises(ValueError, match=msg): + cf.set_option("a", "ab") + + msg = "Value must be a nonnegative integer or None" + with pytest.raises(ValueError, match=msg): + cf.register_option("a.b.c.d3", "NO", "doc", validator=cf.is_nonnegative_int) + with pytest.raises(ValueError, match=msg): + cf.register_option("a.b.c.d3", -2, "doc", validator=cf.is_nonnegative_int) + + msg = r"Value must be an instance of \|" + with pytest.raises(ValueError, match=msg): + cf.set_option("b.c", 1) + + validator = cf.is_one_of_factory([None, cf.is_callable]) + cf.register_option("b", lambda: None, "doc", validator=validator) + # pylint: disable-next=consider-using-f-string + cf.set_option("b", "%.1f".format) # Formatter is callable + cf.set_option("b", None) # Formatter is none (default) + with pytest.raises(ValueError, match="Value must be a callable"): + cf.set_option("b", "%.1f") + + def test_reset_option(self): + cf.register_option("a", 1, "doc", validator=cf.is_int) + cf.register_option("b.c", "hullo", "doc2", validator=cf.is_str) + assert cf.get_option("a") == 1 + assert cf.get_option("b.c") == "hullo" + + cf.set_option("a", 2) + cf.set_option("b.c", "wurld") + assert cf.get_option("a") == 2 + assert cf.get_option("b.c") == "wurld" + + cf.reset_option("a") + assert cf.get_option("a") == 1 + assert cf.get_option("b.c") == "wurld" + cf.reset_option("b.c") + assert cf.get_option("a") == 1 + assert cf.get_option("b.c") == "hullo" + + def test_reset_option_all(self): + cf.register_option("a", 1, "doc", validator=cf.is_int) + cf.register_option("b.c", "hullo", "doc2", validator=cf.is_str) + assert cf.get_option("a") == 1 + assert cf.get_option("b.c") == "hullo" + + cf.set_option("a", 2) + cf.set_option("b.c", "wurld") + assert cf.get_option("a") == 2 + assert cf.get_option("b.c") == "wurld" + + cf.reset_option("all") + assert cf.get_option("a") == 1 + assert cf.get_option("b.c") == "hullo" + + def test_deprecate_option(self): + # we can deprecate non-existent options + cf.deprecate_option("foo") + + assert cf._is_deprecated("foo") + with tm.assert_produces_warning(FutureWarning, match="deprecated"): + with pytest.raises(KeyError, match="No such keys.s.: 'foo'"): + cf.get_option("foo") + + cf.register_option("a", 1, "doc", validator=cf.is_int) + cf.register_option("b.c", "hullo", "doc2") + cf.register_option("foo", "hullo", "doc2") + + cf.deprecate_option("a", removal_ver="nifty_ver") + with tm.assert_produces_warning(FutureWarning, match="eprecated.*nifty_ver"): + cf.get_option("a") + + msg = "Option 'a' has already been defined as deprecated" + with pytest.raises(OptionError, match=msg): + cf.deprecate_option("a") + + cf.deprecate_option("b.c", "zounds!") + with tm.assert_produces_warning(FutureWarning, match="zounds!"): + cf.get_option("b.c") + + # test rerouting keys + cf.register_option("d.a", "foo", "doc2") + cf.register_option("d.dep", "bar", "doc2") + assert cf.get_option("d.a") == "foo" + assert cf.get_option("d.dep") == "bar" + + cf.deprecate_option("d.dep", rkey="d.a") # reroute d.dep to d.a + with tm.assert_produces_warning(FutureWarning, match="eprecated"): + assert cf.get_option("d.dep") == "foo" + + with tm.assert_produces_warning(FutureWarning, match="eprecated"): + cf.set_option("d.dep", "baz") # should overwrite "d.a" + + with tm.assert_produces_warning(FutureWarning, match="eprecated"): + assert cf.get_option("d.dep") == "baz" + + def test_config_prefix(self): + with cf.config_prefix("base"): + cf.register_option("a", 1, "doc1") + cf.register_option("b", 2, "doc2") + assert cf.get_option("a") == 1 + assert cf.get_option("b") == 2 + + cf.set_option("a", 3) + cf.set_option("b", 4) + assert cf.get_option("a") == 3 + assert cf.get_option("b") == 4 + + assert cf.get_option("base.a") == 3 + assert cf.get_option("base.b") == 4 + assert "doc1" in cf.describe_option("base.a", _print_desc=False) + assert "doc2" in cf.describe_option("base.b", _print_desc=False) + + cf.reset_option("base.a") + cf.reset_option("base.b") + + with cf.config_prefix("base"): + assert cf.get_option("a") == 1 + assert cf.get_option("b") == 2 + + def test_callback(self): + k = [None] + v = [None] + + def callback(key): + k.append(key) + v.append(cf.get_option(key)) + + cf.register_option("d.a", "foo", cb=callback) + cf.register_option("d.b", "foo", cb=callback) + + del k[-1], v[-1] + cf.set_option("d.a", "fooz") + assert k[-1] == "d.a" + assert v[-1] == "fooz" + + del k[-1], v[-1] + cf.set_option("d.b", "boo") + assert k[-1] == "d.b" + assert v[-1] == "boo" + + del k[-1], v[-1] + cf.reset_option("d.b") + assert k[-1] == "d.b" + + def test_set_ContextManager(self): + def eq(val): + assert cf.get_option("a") == val + + cf.register_option("a", 0) + eq(0) + with cf.option_context("a", 15): + eq(15) + with cf.option_context("a", 25): + eq(25) + eq(15) + eq(0) + + cf.set_option("a", 17) + eq(17) + + # Test that option_context can be used as a decorator too (#34253). + @cf.option_context("a", 123) + def f(): + eq(123) + + f() + + def test_attribute_access(self): + holder = [] + + def f3(key): + holder.append(True) + + cf.register_option("a", 0) + cf.register_option("c", 0, cb=f3) + options = cf.options + + assert options.a == 0 + with cf.option_context("a", 15): + assert options.a == 15 + + options.a = 500 + assert cf.get_option("a") == 500 + + cf.reset_option("a") + assert options.a == cf.get_option("a", 0) + + msg = "You can only set the value of existing options" + with pytest.raises(OptionError, match=msg): + options.b = 1 + with pytest.raises(OptionError, match=msg): + options.display = 1 + + # make sure callback kicks when using this form of setting + options.c = 1 + assert len(holder) == 1 + + def test_option_context_scope(self): + # Ensure that creating a context does not affect the existing + # environment as it is supposed to be used with the `with` statement. + # See https://github.com/pandas-dev/pandas/issues/8514 + + original_value = 60 + context_value = 10 + option_name = "a" + + cf.register_option(option_name, original_value) + + # Ensure creating contexts didn't affect the current context. + ctx = cf.option_context(option_name, context_value) + assert cf.get_option(option_name) == original_value + + # Ensure the correct value is available inside the context. + with ctx: + assert cf.get_option(option_name) == context_value + + # Ensure the current context is reset + assert cf.get_option(option_name) == original_value + + def test_dictwrapper_getattr(self): + options = cf.options + # GH 19789 + with pytest.raises(OptionError, match="No such option"): + options.bananas + assert not hasattr(options, "bananas") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/config/test_localization.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/config/test_localization.py new file mode 100644 index 0000000000000000000000000000000000000000..3907f557d1075536e46d12f219dc9b0c3f3f32c1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/config/test_localization.py @@ -0,0 +1,156 @@ +import codecs +import locale +import os + +import pytest + +from pandas._config.localization import ( + can_set_locale, + get_locales, + set_locale, +) + +from pandas.compat import ISMUSL + +import pandas as pd + +_all_locales = get_locales() +_current_locale = locale.setlocale(locale.LC_ALL) # getlocale() is wrong, see GH#46595 + +# Don't run any of these tests if we have no locales. +pytestmark = pytest.mark.skipif(not _all_locales, reason="Need locales") + +_skip_if_only_one_locale = pytest.mark.skipif( + len(_all_locales) <= 1, reason="Need multiple locales for meaningful test" +) + + +def _get_current_locale(lc_var: int = locale.LC_ALL) -> str: + # getlocale is not always compliant with setlocale, use setlocale. GH#46595 + return locale.setlocale(lc_var) + + +@pytest.mark.parametrize("lc_var", (locale.LC_ALL, locale.LC_CTYPE, locale.LC_TIME)) +def test_can_set_current_locale(lc_var): + # Can set the current locale + before_locale = _get_current_locale(lc_var) + assert can_set_locale(before_locale, lc_var=lc_var) + after_locale = _get_current_locale(lc_var) + assert before_locale == after_locale + + +@pytest.mark.parametrize("lc_var", (locale.LC_ALL, locale.LC_CTYPE, locale.LC_TIME)) +def test_can_set_locale_valid_set(lc_var): + # Can set the default locale. + before_locale = _get_current_locale(lc_var) + assert can_set_locale("", lc_var=lc_var) + after_locale = _get_current_locale(lc_var) + assert before_locale == after_locale + + +@pytest.mark.parametrize( + "lc_var", + ( + locale.LC_ALL, + locale.LC_CTYPE, + pytest.param( + locale.LC_TIME, + marks=pytest.mark.skipif( + ISMUSL, reason="MUSL allows setting invalid LC_TIME." + ), + ), + ), +) +def test_can_set_locale_invalid_set(lc_var): + # Cannot set an invalid locale. + before_locale = _get_current_locale(lc_var) + assert not can_set_locale("non-existent_locale", lc_var=lc_var) + after_locale = _get_current_locale(lc_var) + assert before_locale == after_locale + + +@pytest.mark.parametrize( + "lang,enc", + [ + ("it_CH", "UTF-8"), + ("en_US", "ascii"), + ("zh_CN", "GB2312"), + ("it_IT", "ISO-8859-1"), + ], +) +@pytest.mark.parametrize("lc_var", (locale.LC_ALL, locale.LC_CTYPE, locale.LC_TIME)) +def test_can_set_locale_no_leak(lang, enc, lc_var): + # Test that can_set_locale does not leak even when returning False. See GH#46595 + before_locale = _get_current_locale(lc_var) + can_set_locale((lang, enc), locale.LC_ALL) + after_locale = _get_current_locale(lc_var) + assert before_locale == after_locale + + +def test_can_set_locale_invalid_get(monkeypatch): + # see GH#22129 + # In some cases, an invalid locale can be set, + # but a subsequent getlocale() raises a ValueError. + + def mock_get_locale(): + raise ValueError() + + with monkeypatch.context() as m: + m.setattr(locale, "getlocale", mock_get_locale) + assert not can_set_locale("") + + +def test_get_locales_at_least_one(): + # see GH#9744 + assert len(_all_locales) > 0 + + +@_skip_if_only_one_locale +def test_get_locales_prefix(): + first_locale = _all_locales[0] + assert len(get_locales(prefix=first_locale[:2])) > 0 + + +@_skip_if_only_one_locale +@pytest.mark.parametrize( + "lang,enc", + [ + ("it_CH", "UTF-8"), + ("en_US", "ascii"), + ("zh_CN", "GB2312"), + ("it_IT", "ISO-8859-1"), + ], +) +def test_set_locale(lang, enc): + before_locale = _get_current_locale() + + enc = codecs.lookup(enc).name + new_locale = lang, enc + + if not can_set_locale(new_locale): + msg = "unsupported locale setting" + + with pytest.raises(locale.Error, match=msg): + with set_locale(new_locale): + pass + else: + with set_locale(new_locale) as normalized_locale: + new_lang, new_enc = normalized_locale.split(".") + new_enc = codecs.lookup(enc).name + + normalized_locale = new_lang, new_enc + assert normalized_locale == new_locale + + # Once we exit the "with" statement, locale should be back to what it was. + after_locale = _get_current_locale() + assert before_locale == after_locale + + +def test_encoding_detected(): + system_locale = os.environ.get("LC_ALL") + system_encoding = system_locale.split(".")[-1] if system_locale else "utf-8" + + assert ( + codecs.lookup(pd.options.display.encoding).name + == codecs.lookup(system_encoding).name + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/construction/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/construction/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..092ccb8957532de069f652018f04ee582c427e86 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/construction/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/construction/__pycache__/test_extract_array.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/construction/__pycache__/test_extract_array.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f18645245bdca917c097b6ca0a1b6013555a7980 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/construction/__pycache__/test_extract_array.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b0d3eb79a0a8855ae076d0e4dac8577b2a4dc0b3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_array.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_array.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c60846ed34a2b829a8eee1f540d0a739ad764e22 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_array.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_astype.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_astype.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d41d60f9aec57efa169e4f77e70eb8b9c3d6e489 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_astype.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_chained_assignment_deprecation.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_chained_assignment_deprecation.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..646d6df10dfbdc24d86a6eb1b52f930fa1fd5788 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_chained_assignment_deprecation.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_clip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_clip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91ac01b72ffeca02616c37f529a2aca817fa384b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_clip.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_constructors.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_constructors.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b77d694b4714af7029cebd64018df467501e7e09 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_constructors.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_core_functionalities.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_core_functionalities.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b51be60ad9060c80c07c8886c0d5d28e6d79504 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_core_functionalities.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_functions.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_functions.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5136d6b52cdaf40ac17369475e3ff65abbfaa332 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_functions.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_indexing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_indexing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9bb1561de7d88d4a5951a63b8c49cc98f42bd88d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_indexing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_internals.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_internals.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..febe7cf4ea23dfba2c12bf3ce544aa7cf59cdb97 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_internals.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_interp_fillna.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_interp_fillna.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6cd0d69d6e6a38627ef3de2137df3d302b4935c0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_interp_fillna.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_replace.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_replace.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6dee1f62a3fcbb7597ac73649df18cec8b4b3a3a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_replace.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_setitem.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_setitem.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..702413372fd39b16d0c442ff08d5ad1003c7a348 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_setitem.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_util.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_util.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..05b1768457f670af58cc1dd6596182fc4acb1b8d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/test_util.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/util.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/util.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..162ab37963ffce33362d082377e2702fdc920919 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__pycache__/util.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7e9ce01e1b0dd058ecdf3464f5945ded2425953 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/__pycache__/test_datetimeindex.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/__pycache__/test_datetimeindex.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20cd5b3a630c9fc4726ba47bd0497aaff280c5f8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/__pycache__/test_datetimeindex.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/__pycache__/test_index.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/__pycache__/test_index.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1be26d45e6d8ba7b37313d2e04baf135964d02ce Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/__pycache__/test_index.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/__pycache__/test_periodindex.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/__pycache__/test_periodindex.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b3a960c363d6e51b36a0e671bce50e07e20d42e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/__pycache__/test_periodindex.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/__pycache__/test_timedeltaindex.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/__pycache__/test_timedeltaindex.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2afa967aec1edb3c333864918eea233414313c70 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/__pycache__/test_timedeltaindex.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/test_datetimeindex.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/test_datetimeindex.py new file mode 100644 index 0000000000000000000000000000000000000000..b023297c9549d88f6e1c493e50f148a74f26cea6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/test_datetimeindex.py @@ -0,0 +1,69 @@ +import pytest + +from pandas import ( + DatetimeIndex, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Setting a value on a view:FutureWarning" +) + + +@pytest.mark.parametrize( + "cons", + [ + lambda x: DatetimeIndex(x), + lambda x: DatetimeIndex(DatetimeIndex(x)), + ], +) +def test_datetimeindex(using_copy_on_write, cons): + dt = date_range("2019-12-31", periods=3, freq="D") + ser = Series(dt) + idx = cons(ser) + expected = idx.copy(deep=True) + ser.iloc[0] = Timestamp("2020-12-31") + if using_copy_on_write: + tm.assert_index_equal(idx, expected) + + +def test_datetimeindex_tz_convert(using_copy_on_write): + dt = date_range("2019-12-31", periods=3, freq="D", tz="Europe/Berlin") + ser = Series(dt) + idx = DatetimeIndex(ser).tz_convert("US/Eastern") + expected = idx.copy(deep=True) + ser.iloc[0] = Timestamp("2020-12-31", tz="Europe/Berlin") + if using_copy_on_write: + tm.assert_index_equal(idx, expected) + + +def test_datetimeindex_tz_localize(using_copy_on_write): + dt = date_range("2019-12-31", periods=3, freq="D") + ser = Series(dt) + idx = DatetimeIndex(ser).tz_localize("Europe/Berlin") + expected = idx.copy(deep=True) + ser.iloc[0] = Timestamp("2020-12-31") + if using_copy_on_write: + tm.assert_index_equal(idx, expected) + + +def test_datetimeindex_isocalendar(using_copy_on_write): + dt = date_range("2019-12-31", periods=3, freq="D") + ser = Series(dt) + df = DatetimeIndex(ser).isocalendar() + expected = df.index.copy(deep=True) + ser.iloc[0] = Timestamp("2020-12-31") + if using_copy_on_write: + tm.assert_index_equal(df.index, expected) + + +def test_index_values(using_copy_on_write): + idx = date_range("2019-12-31", periods=3, freq="D") + result = idx.values + if using_copy_on_write: + assert result.flags.writeable is False + else: + assert result.flags.writeable is True diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/test_index.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/test_index.py new file mode 100644 index 0000000000000000000000000000000000000000..49d756cf32d34306fbb4eb3525f1c5b70d5f155c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/test_index.py @@ -0,0 +1,184 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Index, + Series, +) +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + + +def index_view(index_data=[1, 2]): + df = DataFrame({"a": index_data, "b": 1.5}) + view = df[:] + df = df.set_index("a", drop=True) + idx = df.index + # df = None + return idx, view + + +def test_set_index_update_column(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1, 2], "b": 1}) + df = df.set_index("a", drop=False) + expected = df.index.copy(deep=True) + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 100 + if using_copy_on_write: + tm.assert_index_equal(df.index, expected) + else: + tm.assert_index_equal(df.index, Index([100, 2], name="a")) + + +def test_set_index_drop_update_column(using_copy_on_write): + df = DataFrame({"a": [1, 2], "b": 1.5}) + view = df[:] + df = df.set_index("a", drop=True) + expected = df.index.copy(deep=True) + view.iloc[0, 0] = 100 + tm.assert_index_equal(df.index, expected) + + +def test_set_index_series(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1, 2], "b": 1.5}) + ser = Series([10, 11]) + df = df.set_index(ser) + expected = df.index.copy(deep=True) + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 100 + if using_copy_on_write: + tm.assert_index_equal(df.index, expected) + else: + tm.assert_index_equal(df.index, Index([100, 11])) + + +def test_assign_index_as_series(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1, 2], "b": 1.5}) + ser = Series([10, 11]) + df.index = ser + expected = df.index.copy(deep=True) + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 100 + if using_copy_on_write: + tm.assert_index_equal(df.index, expected) + else: + tm.assert_index_equal(df.index, Index([100, 11])) + + +def test_assign_index_as_index(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1, 2], "b": 1.5}) + ser = Series([10, 11]) + rhs_index = Index(ser) + df.index = rhs_index + rhs_index = None # overwrite to clear reference + expected = df.index.copy(deep=True) + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 100 + if using_copy_on_write: + tm.assert_index_equal(df.index, expected) + else: + tm.assert_index_equal(df.index, Index([100, 11])) + + +def test_index_from_series(using_copy_on_write, warn_copy_on_write): + ser = Series([1, 2]) + idx = Index(ser) + expected = idx.copy(deep=True) + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 100 + if using_copy_on_write: + tm.assert_index_equal(idx, expected) + else: + tm.assert_index_equal(idx, Index([100, 2])) + + +def test_index_from_series_copy(using_copy_on_write): + ser = Series([1, 2]) + idx = Index(ser, copy=True) # noqa: F841 + arr = get_array(ser) + ser.iloc[0] = 100 + assert np.shares_memory(get_array(ser), arr) + + +def test_index_from_index(using_copy_on_write, warn_copy_on_write): + ser = Series([1, 2]) + idx = Index(ser) + idx = Index(idx) + expected = idx.copy(deep=True) + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 100 + if using_copy_on_write: + tm.assert_index_equal(idx, expected) + else: + tm.assert_index_equal(idx, Index([100, 2])) + + +@pytest.mark.parametrize( + "func", + [ + lambda x: x._shallow_copy(x._values), + lambda x: x.view(), + lambda x: x.take([0, 1]), + lambda x: x.repeat([1, 1]), + lambda x: x[slice(0, 2)], + lambda x: x[[0, 1]], + lambda x: x._getitem_slice(slice(0, 2)), + lambda x: x.delete([]), + lambda x: x.rename("b"), + lambda x: x.astype("Int64", copy=False), + ], + ids=[ + "_shallow_copy", + "view", + "take", + "repeat", + "getitem_slice", + "getitem_list", + "_getitem_slice", + "delete", + "rename", + "astype", + ], +) +def test_index_ops(using_copy_on_write, func, request): + idx, view_ = index_view() + expected = idx.copy(deep=True) + if "astype" in request.node.callspec.id: + expected = expected.astype("Int64") + idx = func(idx) + view_.iloc[0, 0] = 100 + if using_copy_on_write: + tm.assert_index_equal(idx, expected, check_names=False) + + +def test_infer_objects(using_copy_on_write): + idx, view_ = index_view(["a", "b"]) + expected = idx.copy(deep=True) + idx = idx.infer_objects(copy=False) + view_.iloc[0, 0] = "aaaa" + if using_copy_on_write: + tm.assert_index_equal(idx, expected, check_names=False) + + +def test_index_to_frame(using_copy_on_write): + idx = Index([1, 2, 3], name="a") + expected = idx.copy(deep=True) + df = idx.to_frame() + if using_copy_on_write: + assert np.shares_memory(get_array(df, "a"), idx._values) + assert not df._mgr._has_no_reference(0) + else: + assert not np.shares_memory(get_array(df, "a"), idx._values) + + df.iloc[0, 0] = 100 + tm.assert_index_equal(idx, expected) + + +def test_index_values(using_copy_on_write): + idx = Index([1, 2, 3]) + result = idx.values + if using_copy_on_write: + assert result.flags.writeable is False + else: + assert result.flags.writeable is True diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/test_periodindex.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/test_periodindex.py new file mode 100644 index 0000000000000000000000000000000000000000..b80ce1d3d838fc0f517089d452221ac19363a9b8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/test_periodindex.py @@ -0,0 +1,30 @@ +import pytest + +from pandas import ( + Period, + PeriodIndex, + Series, + period_range, +) +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Setting a value on a view:FutureWarning" +) + + +@pytest.mark.parametrize( + "cons", + [ + lambda x: PeriodIndex(x), + lambda x: PeriodIndex(PeriodIndex(x)), + ], +) +def test_periodindex(using_copy_on_write, cons): + dt = period_range("2019-12-31", periods=3, freq="D") + ser = Series(dt) + idx = cons(ser) + expected = idx.copy(deep=True) + ser.iloc[0] = Period("2020-12-31") + if using_copy_on_write: + tm.assert_index_equal(idx, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/test_timedeltaindex.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/test_timedeltaindex.py new file mode 100644 index 0000000000000000000000000000000000000000..5b9832093fded0f48c523bdbc363d043a871eb60 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/index/test_timedeltaindex.py @@ -0,0 +1,30 @@ +import pytest + +from pandas import ( + Series, + Timedelta, + TimedeltaIndex, + timedelta_range, +) +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Setting a value on a view:FutureWarning" +) + + +@pytest.mark.parametrize( + "cons", + [ + lambda x: TimedeltaIndex(x), + lambda x: TimedeltaIndex(TimedeltaIndex(x)), + ], +) +def test_timedeltaindex(using_copy_on_write, cons): + dt = timedelta_range("1 day", periods=3) + ser = Series(dt) + idx = cons(ser) + expected = idx.copy(deep=True) + ser.iloc[0] = Timedelta("5 days") + if using_copy_on_write: + tm.assert_index_equal(idx, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_chained_assignment_deprecation.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_chained_assignment_deprecation.py new file mode 100644 index 0000000000000000000000000000000000000000..0a37f6b813e55d6072506a5c8168b050aa79ecda --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_chained_assignment_deprecation.py @@ -0,0 +1,174 @@ +import numpy as np +import pytest + +from pandas.compat import PY311 +from pandas.errors import ( + ChainedAssignmentError, + SettingWithCopyWarning, +) + +from pandas import ( + DataFrame, + option_context, +) +import pandas._testing as tm + + +def test_methods_iloc_warn(using_copy_on_write): + if not using_copy_on_write: + df = DataFrame({"a": [1, 2, 3], "b": 1}) + with tm.assert_cow_warning(match="A value"): + df.iloc[:, 0].replace(1, 5, inplace=True) + + with tm.assert_cow_warning(match="A value"): + df.iloc[:, 0].fillna(1, inplace=True) + + with tm.assert_cow_warning(match="A value"): + df.iloc[:, 0].interpolate(inplace=True) + + with tm.assert_cow_warning(match="A value"): + df.iloc[:, 0].ffill(inplace=True) + + with tm.assert_cow_warning(match="A value"): + df.iloc[:, 0].bfill(inplace=True) + + +@pytest.mark.parametrize( + "func, args", + [ + ("replace", (4, 5)), + ("fillna", (1,)), + ("interpolate", ()), + ("bfill", ()), + ("ffill", ()), + ], +) +def test_methods_iloc_getitem_item_cache( + func, args, using_copy_on_write, warn_copy_on_write +): + # ensure we don't incorrectly raise chained assignment warning because + # of the item cache / iloc not setting the item cache + df_orig = DataFrame({"a": [1, 2, 3], "b": 1}) + + df = df_orig.copy() + ser = df.iloc[:, 0] + getattr(ser, func)(*args, inplace=True) + + # parent that holds item_cache is dead, so don't increase ref count + df = df_orig.copy() + ser = df.copy()["a"] + getattr(ser, func)(*args, inplace=True) + + df = df_orig.copy() + df["a"] # populate the item_cache + ser = df.iloc[:, 0] # iloc creates a new object + getattr(ser, func)(*args, inplace=True) + + df = df_orig.copy() + df["a"] # populate the item_cache + ser = df["a"] + getattr(ser, func)(*args, inplace=True) + + df = df_orig.copy() + df["a"] # populate the item_cache + # TODO(CoW-warn) because of the usage of *args, this doesn't warn on Py3.11+ + if using_copy_on_write: + with tm.raises_chained_assignment_error(not PY311): + getattr(df["a"], func)(*args, inplace=True) + else: + with tm.assert_cow_warning(not PY311, match="A value"): + getattr(df["a"], func)(*args, inplace=True) + + df = df_orig.copy() + ser = df["a"] # populate the item_cache and keep ref + if using_copy_on_write: + with tm.raises_chained_assignment_error(not PY311): + getattr(df["a"], func)(*args, inplace=True) + else: + # ideally also warns on the default mode, but the ser' _cacher + # messes up the refcount + even in warning mode this doesn't trigger + # the warning of Py3.1+ (see above) + with tm.assert_cow_warning(warn_copy_on_write and not PY311, match="A value"): + getattr(df["a"], func)(*args, inplace=True) + + +def test_methods_iloc_getitem_item_cache_fillna( + using_copy_on_write, warn_copy_on_write +): + # ensure we don't incorrectly raise chained assignment warning because + # of the item cache / iloc not setting the item cache + df_orig = DataFrame({"a": [1, 2, 3], "b": 1}) + + df = df_orig.copy() + ser = df.iloc[:, 0] + ser.fillna(1, inplace=True) + + # parent that holds item_cache is dead, so don't increase ref count + df = df_orig.copy() + ser = df.copy()["a"] + ser.fillna(1, inplace=True) + + df = df_orig.copy() + df["a"] # populate the item_cache + ser = df.iloc[:, 0] # iloc creates a new object + ser.fillna(1, inplace=True) + + df = df_orig.copy() + df["a"] # populate the item_cache + ser = df["a"] + ser.fillna(1, inplace=True) + + df = df_orig.copy() + df["a"] # populate the item_cache + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + df["a"].fillna(1, inplace=True) + else: + with tm.assert_cow_warning(match="A value"): + df["a"].fillna(1, inplace=True) + + df = df_orig.copy() + ser = df["a"] # populate the item_cache and keep ref + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + df["a"].fillna(1, inplace=True) + else: + # TODO(CoW-warn) ideally also warns on the default mode, but the ser' _cacher + # messes up the refcount + with tm.assert_cow_warning(warn_copy_on_write, match="A value"): + df["a"].fillna(1, inplace=True) + + +# TODO(CoW-warn) expand the cases +@pytest.mark.parametrize( + "indexer", [0, [0, 1], slice(0, 2), np.array([True, False, True])] +) +def test_series_setitem(indexer, using_copy_on_write, warn_copy_on_write): + # ensure we only get a single warning for those typical cases of chained + # assignment + df = DataFrame({"a": [1, 2, 3], "b": 1}) + + # using custom check instead of tm.assert_produces_warning because that doesn't + # fail if multiple warnings are raised + with pytest.warns() as record: + df["a"][indexer] = 0 + assert len(record) == 1 + if using_copy_on_write: + assert record[0].category == ChainedAssignmentError + else: + assert record[0].category == FutureWarning + assert "ChainedAssignmentError" in record[0].message.args[0] + + +@pytest.mark.filterwarnings("ignore::pandas.errors.SettingWithCopyWarning") +@pytest.mark.parametrize( + "indexer", ["a", ["a", "b"], slice(0, 2), np.array([True, False, True])] +) +def test_frame_setitem(indexer, using_copy_on_write): + df = DataFrame({"a": [1, 2, 3, 4, 5], "b": 1}) + + extra_warnings = () if using_copy_on_write else (SettingWithCopyWarning,) + + with option_context("chained_assignment", "warn"): + with tm.raises_chained_assignment_error(extra_warnings=extra_warnings): + df[0:3][indexer] = 10 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_functions.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..56e4b186350f2719978d6ca3803154033c8e08af --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_functions.py @@ -0,0 +1,396 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Index, + Series, + concat, + merge, +) +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + + +def test_concat_frames(using_copy_on_write): + df = DataFrame({"b": ["a"] * 3}) + df2 = DataFrame({"a": ["a"] * 3}) + df_orig = df.copy() + result = concat([df, df2], axis=1) + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + else: + assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + + result.iloc[0, 0] = "d" + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + + result.iloc[0, 1] = "d" + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + tm.assert_frame_equal(df, df_orig) + + +def test_concat_frames_updating_input(using_copy_on_write): + df = DataFrame({"b": ["a"] * 3}) + df2 = DataFrame({"a": ["a"] * 3}) + result = concat([df, df2], axis=1) + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + else: + assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + + expected = result.copy() + df.iloc[0, 0] = "d" + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + + df2.iloc[0, 0] = "d" + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + tm.assert_frame_equal(result, expected) + + +def test_concat_series(using_copy_on_write): + ser = Series([1, 2], name="a") + ser2 = Series([3, 4], name="b") + ser_orig = ser.copy() + ser2_orig = ser2.copy() + result = concat([ser, ser2], axis=1) + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), ser.values) + assert np.shares_memory(get_array(result, "b"), ser2.values) + else: + assert not np.shares_memory(get_array(result, "a"), ser.values) + assert not np.shares_memory(get_array(result, "b"), ser2.values) + + result.iloc[0, 0] = 100 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), ser.values) + assert np.shares_memory(get_array(result, "b"), ser2.values) + + result.iloc[0, 1] = 1000 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "b"), ser2.values) + tm.assert_series_equal(ser, ser_orig) + tm.assert_series_equal(ser2, ser2_orig) + + +def test_concat_frames_chained(using_copy_on_write): + df1 = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + df2 = DataFrame({"c": [4, 5, 6]}) + df3 = DataFrame({"d": [4, 5, 6]}) + result = concat([concat([df1, df2], axis=1), df3], axis=1) + expected = result.copy() + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert np.shares_memory(get_array(result, "c"), get_array(df2, "c")) + assert np.shares_memory(get_array(result, "d"), get_array(df3, "d")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert not np.shares_memory(get_array(result, "c"), get_array(df2, "c")) + assert not np.shares_memory(get_array(result, "d"), get_array(df3, "d")) + + df1.iloc[0, 0] = 100 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + + tm.assert_frame_equal(result, expected) + + +def test_concat_series_chained(using_copy_on_write): + ser1 = Series([1, 2, 3], name="a") + ser2 = Series([4, 5, 6], name="c") + ser3 = Series([4, 5, 6], name="d") + result = concat([concat([ser1, ser2], axis=1), ser3], axis=1) + expected = result.copy() + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(ser1, "a")) + assert np.shares_memory(get_array(result, "c"), get_array(ser2, "c")) + assert np.shares_memory(get_array(result, "d"), get_array(ser3, "d")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(ser1, "a")) + assert not np.shares_memory(get_array(result, "c"), get_array(ser2, "c")) + assert not np.shares_memory(get_array(result, "d"), get_array(ser3, "d")) + + ser1.iloc[0] = 100 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(ser1, "a")) + + tm.assert_frame_equal(result, expected) + + +def test_concat_series_updating_input(using_copy_on_write): + ser = Series([1, 2], name="a") + ser2 = Series([3, 4], name="b") + expected = DataFrame({"a": [1, 2], "b": [3, 4]}) + result = concat([ser, ser2], axis=1) + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(ser, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(ser, "a")) + assert not np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) + + ser.iloc[0] = 100 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(ser, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) + tm.assert_frame_equal(result, expected) + + ser2.iloc[0] = 1000 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) + tm.assert_frame_equal(result, expected) + + +def test_concat_mixed_series_frame(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "c": 1}) + ser = Series([4, 5, 6], name="d") + result = concat([df, ser], axis=1) + expected = result.copy() + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(result, "c"), get_array(df, "c")) + assert np.shares_memory(get_array(result, "d"), get_array(ser, "d")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(result, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(result, "d"), get_array(ser, "d")) + + ser.iloc[0] = 100 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "d"), get_array(ser, "d")) + + df.iloc[0, 0] = 100 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("copy", [True, None, False]) +def test_concat_copy_keyword(using_copy_on_write, copy): + df = DataFrame({"a": [1, 2]}) + df2 = DataFrame({"b": [1.5, 2.5]}) + + result = concat([df, df2], axis=1, copy=copy) + + if using_copy_on_write or copy is False: + assert np.shares_memory(get_array(df, "a"), get_array(result, "a")) + assert np.shares_memory(get_array(df2, "b"), get_array(result, "b")) + else: + assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) + assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b")) + + +@pytest.mark.parametrize( + "func", + [ + lambda df1, df2, **kwargs: df1.merge(df2, **kwargs), + lambda df1, df2, **kwargs: merge(df1, df2, **kwargs), + ], +) +def test_merge_on_key(using_copy_on_write, func): + df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]}) + df2 = DataFrame({"key": ["a", "b", "c"], "b": [4, 5, 6]}) + df1_orig = df1.copy() + df2_orig = df2.copy() + + result = func(df1, df2, on="key") + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + assert np.shares_memory(get_array(result, "key"), get_array(df1, "key")) + assert not np.shares_memory(get_array(result, "key"), get_array(df2, "key")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + + result.iloc[0, 1] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + + result.iloc[0, 2] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + tm.assert_frame_equal(df1, df1_orig) + tm.assert_frame_equal(df2, df2_orig) + + +def test_merge_on_index(using_copy_on_write): + df1 = DataFrame({"a": [1, 2, 3]}) + df2 = DataFrame({"b": [4, 5, 6]}) + df1_orig = df1.copy() + df2_orig = df2.copy() + + result = merge(df1, df2, left_index=True, right_index=True) + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + + result.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + + result.iloc[0, 1] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + tm.assert_frame_equal(df1, df1_orig) + tm.assert_frame_equal(df2, df2_orig) + + +@pytest.mark.parametrize( + "func, how", + [ + (lambda df1, df2, **kwargs: merge(df2, df1, on="key", **kwargs), "right"), + (lambda df1, df2, **kwargs: merge(df1, df2, on="key", **kwargs), "left"), + ], +) +def test_merge_on_key_enlarging_one(using_copy_on_write, func, how): + df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]}) + df2 = DataFrame({"key": ["a", "b"], "b": [4, 5]}) + df1_orig = df1.copy() + df2_orig = df2.copy() + + result = func(df1, df2, how=how) + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + assert df2._mgr._has_no_reference(1) + assert df2._mgr._has_no_reference(0) + assert np.shares_memory(get_array(result, "key"), get_array(df1, "key")) is ( + how == "left" + ) + assert not np.shares_memory(get_array(result, "key"), get_array(df2, "key")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + + if how == "left": + result.iloc[0, 1] = 0 + else: + result.iloc[0, 2] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + tm.assert_frame_equal(df1, df1_orig) + tm.assert_frame_equal(df2, df2_orig) + + +@pytest.mark.parametrize("copy", [True, None, False]) +def test_merge_copy_keyword(using_copy_on_write, copy): + df = DataFrame({"a": [1, 2]}) + df2 = DataFrame({"b": [3, 4.5]}) + + result = df.merge(df2, copy=copy, left_index=True, right_index=True) + + if using_copy_on_write or copy is False: + assert np.shares_memory(get_array(df, "a"), get_array(result, "a")) + assert np.shares_memory(get_array(df2, "b"), get_array(result, "b")) + else: + assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) + assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b")) + + +def test_join_on_key(using_copy_on_write): + df_index = Index(["a", "b", "c"], name="key") + + df1 = DataFrame({"a": [1, 2, 3]}, index=df_index.copy(deep=True)) + df2 = DataFrame({"b": [4, 5, 6]}, index=df_index.copy(deep=True)) + + df1_orig = df1.copy() + df2_orig = df2.copy() + + result = df1.join(df2, on="key") + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + assert np.shares_memory(get_array(result.index), get_array(df1.index)) + assert not np.shares_memory(get_array(result.index), get_array(df2.index)) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + + result.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + + result.iloc[0, 1] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + + tm.assert_frame_equal(df1, df1_orig) + tm.assert_frame_equal(df2, df2_orig) + + +def test_join_multiple_dataframes_on_key(using_copy_on_write): + df_index = Index(["a", "b", "c"], name="key") + + df1 = DataFrame({"a": [1, 2, 3]}, index=df_index.copy(deep=True)) + dfs_list = [ + DataFrame({"b": [4, 5, 6]}, index=df_index.copy(deep=True)), + DataFrame({"c": [7, 8, 9]}, index=df_index.copy(deep=True)), + ] + + df1_orig = df1.copy() + dfs_list_orig = [df.copy() for df in dfs_list] + + result = df1.join(dfs_list) + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(dfs_list[0], "b")) + assert np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c")) + assert np.shares_memory(get_array(result.index), get_array(df1.index)) + assert not np.shares_memory( + get_array(result.index), get_array(dfs_list[0].index) + ) + assert not np.shares_memory( + get_array(result.index), get_array(dfs_list[1].index) + ) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert not np.shares_memory(get_array(result, "b"), get_array(dfs_list[0], "b")) + assert not np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c")) + + result.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(dfs_list[0], "b")) + assert np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c")) + + result.iloc[0, 1] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "b"), get_array(dfs_list[0], "b")) + assert np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c")) + + result.iloc[0, 2] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c")) + + tm.assert_frame_equal(df1, df1_orig) + for df, df_orig in zip(dfs_list, dfs_list_orig): + tm.assert_frame_equal(df, df_orig) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_setitem.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_setitem.py new file mode 100644 index 0000000000000000000000000000000000000000..bc3b939734534520f0cf7051dbc72989d0caf990 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_setitem.py @@ -0,0 +1,156 @@ +import numpy as np + +from pandas import ( + DataFrame, + Index, + MultiIndex, + RangeIndex, + Series, +) +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + +# ----------------------------------------------------------------------------- +# Copy/view behaviour for the values that are set in a DataFrame + + +def test_set_column_with_array(): + # Case: setting an array as a new column (df[col] = arr) copies that data + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + arr = np.array([1, 2, 3], dtype="int64") + + df["c"] = arr + + # the array data is copied + assert not np.shares_memory(get_array(df, "c"), arr) + # and thus modifying the array does not modify the DataFrame + arr[0] = 0 + tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c")) + + +def test_set_column_with_series(using_copy_on_write): + # Case: setting a series as a new column (df[col] = s) copies that data + # (with delayed copy with CoW) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + ser = Series([1, 2, 3]) + + df["c"] = ser + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "c"), get_array(ser)) + else: + # the series data is copied + assert not np.shares_memory(get_array(df, "c"), get_array(ser)) + + # and modifying the series does not modify the DataFrame + ser.iloc[0] = 0 + assert ser.iloc[0] == 0 + tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c")) + + +def test_set_column_with_index(using_copy_on_write): + # Case: setting an index as a new column (df[col] = idx) copies that data + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + idx = Index([1, 2, 3]) + + df["c"] = idx + + # the index data is copied + assert not np.shares_memory(get_array(df, "c"), idx.values) + + idx = RangeIndex(1, 4) + arr = idx.values + + df["d"] = idx + + assert not np.shares_memory(get_array(df, "d"), arr) + + +def test_set_columns_with_dataframe(using_copy_on_write): + # Case: setting a DataFrame as new columns copies that data + # (with delayed copy with CoW) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df2 = DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]}) + + df[["c", "d"]] = df2 + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + else: + # the data is copied + assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + + # and modifying the set DataFrame does not modify the original DataFrame + df2.iloc[0, 0] = 0 + tm.assert_series_equal(df["c"], Series([7, 8, 9], name="c")) + + +def test_setitem_series_no_copy(using_copy_on_write): + # Case: setting a Series as column into a DataFrame can delay copying that data + df = DataFrame({"a": [1, 2, 3]}) + rhs = Series([4, 5, 6]) + rhs_orig = rhs.copy() + + # adding a new column + df["b"] = rhs + if using_copy_on_write: + assert np.shares_memory(get_array(rhs), get_array(df, "b")) + + df.iloc[0, 1] = 100 + tm.assert_series_equal(rhs, rhs_orig) + + +def test_setitem_series_no_copy_single_block(using_copy_on_write): + # Overwriting an existing column that is a single block + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + rhs = Series([4, 5, 6]) + rhs_orig = rhs.copy() + + df["a"] = rhs + if using_copy_on_write: + assert np.shares_memory(get_array(rhs), get_array(df, "a")) + + df.iloc[0, 0] = 100 + tm.assert_series_equal(rhs, rhs_orig) + + +def test_setitem_series_no_copy_split_block(using_copy_on_write): + # Overwriting an existing column that is part of a larger block + df = DataFrame({"a": [1, 2, 3], "b": 1}) + rhs = Series([4, 5, 6]) + rhs_orig = rhs.copy() + + df["b"] = rhs + if using_copy_on_write: + assert np.shares_memory(get_array(rhs), get_array(df, "b")) + + df.iloc[0, 1] = 100 + tm.assert_series_equal(rhs, rhs_orig) + + +def test_setitem_series_column_midx_broadcasting(using_copy_on_write): + # Setting a Series to multiple columns will repeat the data + # (currently copying the data eagerly) + df = DataFrame( + [[1, 2, 3], [3, 4, 5]], + columns=MultiIndex.from_arrays([["a", "a", "b"], [1, 2, 3]]), + ) + rhs = Series([10, 11]) + df["a"] = rhs + assert not np.shares_memory(get_array(rhs), df._get_column_array(0)) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + + +def test_set_column_with_inplace_operator(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + + # this should not raise any warning + with tm.assert_produces_warning(None): + df["a"] += 1 + + # when it is not in a chain, then it should produce a warning + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + ser = df["a"] + with tm.assert_cow_warning(warn_copy_on_write): + ser += 1 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_construct_ndarray.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_construct_ndarray.py new file mode 100644 index 0000000000000000000000000000000000000000..ab468c81124bc52d18818cdb1935827a84277ba0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_construct_ndarray.py @@ -0,0 +1,36 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core.construction import sanitize_array + + +@pytest.mark.parametrize( + "values, dtype, expected", + [ + ([1, 2, 3], None, np.array([1, 2, 3], dtype=np.int64)), + (np.array([1, 2, 3]), None, np.array([1, 2, 3])), + (["1", "2", None], None, np.array(["1", "2", None])), + (["1", "2", None], np.dtype("str"), np.array(["1", "2", None])), + ([1, 2, None], np.dtype("str"), np.array(["1", "2", None])), + ], +) +def test_construct_1d_ndarray_preserving_na( + values, dtype, expected, using_infer_string +): + result = sanitize_array(values, index=None, dtype=dtype) + if using_infer_string and expected.dtype == object and dtype is None: + tm.assert_extension_array_equal(result, pd.array(expected)) + else: + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]"]) +def test_construct_1d_ndarray_preserving_na_datetimelike(dtype): + arr = np.arange(5, dtype=np.int64).view(dtype) + expected = np.array(list(arr), dtype=object) + assert all(isinstance(x, type(arr[0])) for x in expected) + + result = sanitize_array(arr, index=None, dtype=np.dtype(object)) + tm.assert_numpy_array_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_construct_object_arr.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_construct_object_arr.py new file mode 100644 index 0000000000000000000000000000000000000000..cb44f91f34dec80c090d3ce3fc9a2dbd4578bb57 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_construct_object_arr.py @@ -0,0 +1,20 @@ +import pytest + +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike + + +@pytest.mark.parametrize("datum1", [1, 2.0, "3", (4, 5), [6, 7], None]) +@pytest.mark.parametrize("datum2", [8, 9.0, "10", (11, 12), [13, 14], None]) +def test_cast_1d_array(datum1, datum2): + data = [datum1, datum2] + result = construct_1d_object_array_from_listlike(data) + + # Direct comparison fails: https://github.com/numpy/numpy/issues/10218 + assert result.dtype == "object" + assert list(result) == data + + +@pytest.mark.parametrize("val", [1, 2.0, None]) +def test_cast_1d_array_invalid_scalar(val): + with pytest.raises(TypeError, match="has no len()"): + construct_1d_object_array_from_listlike(val) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_dict_compat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_dict_compat.py new file mode 100644 index 0000000000000000000000000000000000000000..13dc82d779f953fbea54323785bdcadc3e24dfd8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_dict_compat.py @@ -0,0 +1,14 @@ +import numpy as np + +from pandas.core.dtypes.cast import dict_compat + +from pandas import Timestamp + + +def test_dict_compat(): + data_datetime64 = {np.datetime64("1990-03-15"): 1, np.datetime64("2015-03-15"): 2} + data_unchanged = {1: 2, 3: 4, 5: 6} + expected = {Timestamp("1990-3-15"): 1, Timestamp("2015-03-15"): 2} + assert dict_compat(data_datetime64) == expected + assert dict_compat(expected) == expected + assert dict_compat(data_unchanged) == data_unchanged diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_find_common_type.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_find_common_type.py new file mode 100644 index 0000000000000000000000000000000000000000..83ef7382fbe8a27ad96511a3675c51b9eadc2331 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_find_common_type.py @@ -0,0 +1,175 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.cast import find_common_type +from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, +) + +from pandas import ( + Categorical, + Index, +) + + +@pytest.mark.parametrize( + "source_dtypes,expected_common_dtype", + [ + ((np.int64,), np.int64), + ((np.uint64,), np.uint64), + ((np.float32,), np.float32), + ((object,), object), + # Into ints. + ((np.int16, np.int64), np.int64), + ((np.int32, np.uint32), np.int64), + ((np.uint16, np.uint64), np.uint64), + # Into floats. + ((np.float16, np.float32), np.float32), + ((np.float16, np.int16), np.float32), + ((np.float32, np.int16), np.float32), + ((np.uint64, np.int64), np.float64), + ((np.int16, np.float64), np.float64), + ((np.float16, np.int64), np.float64), + # Into others. + ((np.complex128, np.int32), np.complex128), + ((object, np.float32), object), + ((object, np.int16), object), + # Bool with int. + ((np.dtype("bool"), np.int64), object), + ((np.dtype("bool"), np.int32), object), + ((np.dtype("bool"), np.int16), object), + ((np.dtype("bool"), np.int8), object), + ((np.dtype("bool"), np.uint64), object), + ((np.dtype("bool"), np.uint32), object), + ((np.dtype("bool"), np.uint16), object), + ((np.dtype("bool"), np.uint8), object), + # Bool with float. + ((np.dtype("bool"), np.float64), object), + ((np.dtype("bool"), np.float32), object), + ( + (np.dtype("datetime64[ns]"), np.dtype("datetime64[ns]")), + np.dtype("datetime64[ns]"), + ), + ( + (np.dtype("timedelta64[ns]"), np.dtype("timedelta64[ns]")), + np.dtype("timedelta64[ns]"), + ), + ( + (np.dtype("datetime64[ns]"), np.dtype("datetime64[ms]")), + np.dtype("datetime64[ns]"), + ), + ( + (np.dtype("timedelta64[ms]"), np.dtype("timedelta64[ns]")), + np.dtype("timedelta64[ns]"), + ), + ((np.dtype("datetime64[ns]"), np.dtype("timedelta64[ns]")), object), + ((np.dtype("datetime64[ns]"), np.int64), object), + ], +) +def test_numpy_dtypes(source_dtypes, expected_common_dtype): + source_dtypes = [pandas_dtype(x) for x in source_dtypes] + assert find_common_type(source_dtypes) == expected_common_dtype + + +def test_raises_empty_input(): + with pytest.raises(ValueError, match="no types given"): + find_common_type([]) + + +@pytest.mark.parametrize( + "dtypes,exp_type", + [ + ([CategoricalDtype()], "category"), + ([object, CategoricalDtype()], object), + ([CategoricalDtype(), CategoricalDtype()], "category"), + ], +) +def test_categorical_dtype(dtypes, exp_type): + assert find_common_type(dtypes) == exp_type + + +def test_datetimetz_dtype_match(): + dtype = DatetimeTZDtype(unit="ns", tz="US/Eastern") + assert find_common_type([dtype, dtype]) == "datetime64[ns, US/Eastern]" + + +@pytest.mark.parametrize( + "dtype2", + [ + DatetimeTZDtype(unit="ns", tz="Asia/Tokyo"), + np.dtype("datetime64[ns]"), + object, + np.int64, + ], +) +def test_datetimetz_dtype_mismatch(dtype2): + dtype = DatetimeTZDtype(unit="ns", tz="US/Eastern") + assert find_common_type([dtype, dtype2]) == object + assert find_common_type([dtype2, dtype]) == object + + +def test_period_dtype_match(): + dtype = PeriodDtype(freq="D") + assert find_common_type([dtype, dtype]) == "period[D]" + + +@pytest.mark.parametrize( + "dtype2", + [ + DatetimeTZDtype(unit="ns", tz="Asia/Tokyo"), + PeriodDtype(freq="2D"), + PeriodDtype(freq="h"), + np.dtype("datetime64[ns]"), + object, + np.int64, + ], +) +def test_period_dtype_mismatch(dtype2): + dtype = PeriodDtype(freq="D") + assert find_common_type([dtype, dtype2]) == object + assert find_common_type([dtype2, dtype]) == object + + +interval_dtypes = [ + IntervalDtype(np.int64, "right"), + IntervalDtype(np.float64, "right"), + IntervalDtype(np.uint64, "right"), + IntervalDtype(DatetimeTZDtype(unit="ns", tz="US/Eastern"), "right"), + IntervalDtype("M8[ns]", "right"), + IntervalDtype("m8[ns]", "right"), +] + + +@pytest.mark.parametrize("left", interval_dtypes) +@pytest.mark.parametrize("right", interval_dtypes) +def test_interval_dtype(left, right): + result = find_common_type([left, right]) + + if left is right: + assert result is left + + elif left.subtype.kind in ["i", "u", "f"]: + # i.e. numeric + if right.subtype.kind in ["i", "u", "f"]: + # both numeric -> common numeric subtype + expected = IntervalDtype(np.float64, "right") + assert result == expected + else: + assert result == object + + else: + assert result == object + + +@pytest.mark.parametrize("dtype", interval_dtypes) +def test_interval_dtype_with_categorical(dtype): + obj = Index([], dtype=dtype) + + cat = Categorical([], categories=obj) + + result = find_common_type([dtype, cat.dtype]) + assert result == dtype diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_maybe_box_native.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_maybe_box_native.py new file mode 100644 index 0000000000000000000000000000000000000000..3f62f31dac2191a15d7df8db028a9286262d0080 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_maybe_box_native.py @@ -0,0 +1,40 @@ +from datetime import datetime + +import numpy as np +import pytest + +from pandas.core.dtypes.cast import maybe_box_native + +from pandas import ( + Interval, + Period, + Timedelta, + Timestamp, +) + + +@pytest.mark.parametrize( + "obj,expected_dtype", + [ + (b"\x00\x10", bytes), + (int(4), int), + (np.uint(4), int), + (np.int32(-4), int), + (np.uint8(4), int), + (float(454.98), float), + (np.float16(0.4), float), + (np.float64(1.4), float), + (np.bool_(False), bool), + (datetime(2005, 2, 25), datetime), + (np.datetime64("2005-02-25"), Timestamp), + (Timestamp("2005-02-25"), Timestamp), + (np.timedelta64(1, "D"), Timedelta), + (Timedelta(1, "D"), Timedelta), + (Interval(0, 1), Interval), + (Period("4Q2005"), Period), + ], +) +def test_maybe_box_native(obj, expected_dtype): + boxed_obj = maybe_box_native(obj) + result_dtype = type(boxed_obj) + assert result_dtype is expected_dtype diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_promote.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_promote.py new file mode 100644 index 0000000000000000000000000000000000000000..021107724bef73d998191d65b55fb29848fc8b9a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_promote.py @@ -0,0 +1,530 @@ +""" +These test the method maybe_promote from core/dtypes/cast.py +""" + +import datetime +from decimal import Decimal + +import numpy as np +import pytest + +from pandas._libs.tslibs import NaT + +from pandas.core.dtypes.cast import maybe_promote +from pandas.core.dtypes.common import is_scalar +from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.missing import isna + +import pandas as pd + + +def _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar=None): + """ + Auxiliary function to unify testing of scalar/array promotion. + + Parameters + ---------- + dtype : dtype + The value to pass on as the first argument to maybe_promote. + fill_value : scalar + The value to pass on as the second argument to maybe_promote as + a scalar. + expected_dtype : dtype + The expected dtype returned by maybe_promote (by design this is the + same regardless of whether fill_value was passed as a scalar or in an + array!). + exp_val_for_scalar : scalar + The expected value for the (potentially upcast) fill_value returned by + maybe_promote. + """ + assert is_scalar(fill_value) + + # here, we pass on fill_value as a scalar directly; the expected value + # returned from maybe_promote is fill_value, potentially upcast to the + # returned dtype. + result_dtype, result_fill_value = maybe_promote(dtype, fill_value) + expected_fill_value = exp_val_for_scalar + + assert result_dtype == expected_dtype + _assert_match(result_fill_value, expected_fill_value) + + +def _assert_match(result_fill_value, expected_fill_value): + # GH#23982/25425 require the same type in addition to equality/NA-ness + res_type = type(result_fill_value) + ex_type = type(expected_fill_value) + + if hasattr(result_fill_value, "dtype"): + # Compare types in a way that is robust to platform-specific + # idiosyncrasies where e.g. sometimes we get "ulonglong" as an alias + # for "uint64" or "intc" as an alias for "int32" + assert result_fill_value.dtype.kind == expected_fill_value.dtype.kind + assert result_fill_value.dtype.itemsize == expected_fill_value.dtype.itemsize + else: + # On some builds, type comparison fails, e.g. np.int32 != np.int32 + assert res_type == ex_type or res_type.__name__ == ex_type.__name__ + + match_value = result_fill_value == expected_fill_value + if match_value is pd.NA: + match_value = False + + # Note: type check above ensures that we have the _same_ NA value + # for missing values, None == None (which is checked + # through match_value above), but np.nan != np.nan and pd.NaT != pd.NaT + match_missing = isna(result_fill_value) and isna(expected_fill_value) + + assert match_value or match_missing + + +@pytest.mark.parametrize( + "dtype, fill_value, expected_dtype", + [ + # size 8 + ("int8", 1, "int8"), + ("int8", np.iinfo("int8").max + 1, "int16"), + ("int8", np.iinfo("int16").max + 1, "int32"), + ("int8", np.iinfo("int32").max + 1, "int64"), + ("int8", np.iinfo("int64").max + 1, "object"), + ("int8", -1, "int8"), + ("int8", np.iinfo("int8").min - 1, "int16"), + ("int8", np.iinfo("int16").min - 1, "int32"), + ("int8", np.iinfo("int32").min - 1, "int64"), + ("int8", np.iinfo("int64").min - 1, "object"), + # keep signed-ness as long as possible + ("uint8", 1, "uint8"), + ("uint8", np.iinfo("int8").max + 1, "uint8"), + ("uint8", np.iinfo("uint8").max + 1, "uint16"), + ("uint8", np.iinfo("int16").max + 1, "uint16"), + ("uint8", np.iinfo("uint16").max + 1, "uint32"), + ("uint8", np.iinfo("int32").max + 1, "uint32"), + ("uint8", np.iinfo("uint32").max + 1, "uint64"), + ("uint8", np.iinfo("int64").max + 1, "uint64"), + ("uint8", np.iinfo("uint64").max + 1, "object"), + # max of uint8 cannot be contained in int8 + ("uint8", -1, "int16"), + ("uint8", np.iinfo("int8").min - 1, "int16"), + ("uint8", np.iinfo("int16").min - 1, "int32"), + ("uint8", np.iinfo("int32").min - 1, "int64"), + ("uint8", np.iinfo("int64").min - 1, "object"), + # size 16 + ("int16", 1, "int16"), + ("int16", np.iinfo("int8").max + 1, "int16"), + ("int16", np.iinfo("int16").max + 1, "int32"), + ("int16", np.iinfo("int32").max + 1, "int64"), + ("int16", np.iinfo("int64").max + 1, "object"), + ("int16", -1, "int16"), + ("int16", np.iinfo("int8").min - 1, "int16"), + ("int16", np.iinfo("int16").min - 1, "int32"), + ("int16", np.iinfo("int32").min - 1, "int64"), + ("int16", np.iinfo("int64").min - 1, "object"), + ("uint16", 1, "uint16"), + ("uint16", np.iinfo("int8").max + 1, "uint16"), + ("uint16", np.iinfo("uint8").max + 1, "uint16"), + ("uint16", np.iinfo("int16").max + 1, "uint16"), + ("uint16", np.iinfo("uint16").max + 1, "uint32"), + ("uint16", np.iinfo("int32").max + 1, "uint32"), + ("uint16", np.iinfo("uint32").max + 1, "uint64"), + ("uint16", np.iinfo("int64").max + 1, "uint64"), + ("uint16", np.iinfo("uint64").max + 1, "object"), + ("uint16", -1, "int32"), + ("uint16", np.iinfo("int8").min - 1, "int32"), + ("uint16", np.iinfo("int16").min - 1, "int32"), + ("uint16", np.iinfo("int32").min - 1, "int64"), + ("uint16", np.iinfo("int64").min - 1, "object"), + # size 32 + ("int32", 1, "int32"), + ("int32", np.iinfo("int8").max + 1, "int32"), + ("int32", np.iinfo("int16").max + 1, "int32"), + ("int32", np.iinfo("int32").max + 1, "int64"), + ("int32", np.iinfo("int64").max + 1, "object"), + ("int32", -1, "int32"), + ("int32", np.iinfo("int8").min - 1, "int32"), + ("int32", np.iinfo("int16").min - 1, "int32"), + ("int32", np.iinfo("int32").min - 1, "int64"), + ("int32", np.iinfo("int64").min - 1, "object"), + ("uint32", 1, "uint32"), + ("uint32", np.iinfo("int8").max + 1, "uint32"), + ("uint32", np.iinfo("uint8").max + 1, "uint32"), + ("uint32", np.iinfo("int16").max + 1, "uint32"), + ("uint32", np.iinfo("uint16").max + 1, "uint32"), + ("uint32", np.iinfo("int32").max + 1, "uint32"), + ("uint32", np.iinfo("uint32").max + 1, "uint64"), + ("uint32", np.iinfo("int64").max + 1, "uint64"), + ("uint32", np.iinfo("uint64").max + 1, "object"), + ("uint32", -1, "int64"), + ("uint32", np.iinfo("int8").min - 1, "int64"), + ("uint32", np.iinfo("int16").min - 1, "int64"), + ("uint32", np.iinfo("int32").min - 1, "int64"), + ("uint32", np.iinfo("int64").min - 1, "object"), + # size 64 + ("int64", 1, "int64"), + ("int64", np.iinfo("int8").max + 1, "int64"), + ("int64", np.iinfo("int16").max + 1, "int64"), + ("int64", np.iinfo("int32").max + 1, "int64"), + ("int64", np.iinfo("int64").max + 1, "object"), + ("int64", -1, "int64"), + ("int64", np.iinfo("int8").min - 1, "int64"), + ("int64", np.iinfo("int16").min - 1, "int64"), + ("int64", np.iinfo("int32").min - 1, "int64"), + ("int64", np.iinfo("int64").min - 1, "object"), + ("uint64", 1, "uint64"), + ("uint64", np.iinfo("int8").max + 1, "uint64"), + ("uint64", np.iinfo("uint8").max + 1, "uint64"), + ("uint64", np.iinfo("int16").max + 1, "uint64"), + ("uint64", np.iinfo("uint16").max + 1, "uint64"), + ("uint64", np.iinfo("int32").max + 1, "uint64"), + ("uint64", np.iinfo("uint32").max + 1, "uint64"), + ("uint64", np.iinfo("int64").max + 1, "uint64"), + ("uint64", np.iinfo("uint64").max + 1, "object"), + ("uint64", -1, "object"), + ("uint64", np.iinfo("int8").min - 1, "object"), + ("uint64", np.iinfo("int16").min - 1, "object"), + ("uint64", np.iinfo("int32").min - 1, "object"), + ("uint64", np.iinfo("int64").min - 1, "object"), + ], +) +def test_maybe_promote_int_with_int(dtype, fill_value, expected_dtype): + dtype = np.dtype(dtype) + expected_dtype = np.dtype(expected_dtype) + + # output is not a generic int, but corresponds to expected_dtype + exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_int_with_float(any_int_numpy_dtype, float_numpy_dtype): + dtype = np.dtype(any_int_numpy_dtype) + fill_dtype = np.dtype(float_numpy_dtype) + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling int with float always upcasts to float64 + expected_dtype = np.float64 + # fill_value can be different float type + exp_val_for_scalar = np.float64(fill_value) + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_float_with_int(float_numpy_dtype, any_int_numpy_dtype): + dtype = np.dtype(float_numpy_dtype) + fill_dtype = np.dtype(any_int_numpy_dtype) + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling float with int always keeps float dtype + # because: np.finfo('float32').max > np.iinfo('uint64').max + expected_dtype = dtype + # output is not a generic float, but corresponds to expected_dtype + exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +@pytest.mark.parametrize( + "dtype, fill_value, expected_dtype", + [ + # float filled with float + ("float32", 1, "float32"), + ("float32", float(np.finfo("float32").max) * 1.1, "float64"), + ("float64", 1, "float64"), + ("float64", float(np.finfo("float32").max) * 1.1, "float64"), + # complex filled with float + ("complex64", 1, "complex64"), + ("complex64", float(np.finfo("float32").max) * 1.1, "complex128"), + ("complex128", 1, "complex128"), + ("complex128", float(np.finfo("float32").max) * 1.1, "complex128"), + # float filled with complex + ("float32", 1 + 1j, "complex64"), + ("float32", float(np.finfo("float32").max) * (1.1 + 1j), "complex128"), + ("float64", 1 + 1j, "complex128"), + ("float64", float(np.finfo("float32").max) * (1.1 + 1j), "complex128"), + # complex filled with complex + ("complex64", 1 + 1j, "complex64"), + ("complex64", float(np.finfo("float32").max) * (1.1 + 1j), "complex128"), + ("complex128", 1 + 1j, "complex128"), + ("complex128", float(np.finfo("float32").max) * (1.1 + 1j), "complex128"), + ], +) +def test_maybe_promote_float_with_float(dtype, fill_value, expected_dtype): + dtype = np.dtype(dtype) + expected_dtype = np.dtype(expected_dtype) + + # output is not a generic float, but corresponds to expected_dtype + exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_bool_with_any(any_numpy_dtype): + dtype = np.dtype(bool) + fill_dtype = np.dtype(any_numpy_dtype) + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling bool with anything but bool casts to object + expected_dtype = np.dtype(object) if fill_dtype != bool else fill_dtype + exp_val_for_scalar = fill_value + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_any_with_bool(any_numpy_dtype): + dtype = np.dtype(any_numpy_dtype) + fill_value = True + + # filling anything but bool with bool casts to object + expected_dtype = np.dtype(object) if dtype != bool else dtype + # output is not a generic bool, but corresponds to expected_dtype + exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_bytes_with_any(bytes_dtype, any_numpy_dtype): + dtype = np.dtype(bytes_dtype) + fill_dtype = np.dtype(any_numpy_dtype) + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # we never use bytes dtype internally, always promote to object + expected_dtype = np.dtype(np.object_) + exp_val_for_scalar = fill_value + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_any_with_bytes(any_numpy_dtype): + dtype = np.dtype(any_numpy_dtype) + + # create array of given dtype + fill_value = b"abc" + + # we never use bytes dtype internally, always promote to object + expected_dtype = np.dtype(np.object_) + # output is not a generic bytes, but corresponds to expected_dtype + exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_datetime64_with_any(datetime64_dtype, any_numpy_dtype): + dtype = np.dtype(datetime64_dtype) + fill_dtype = np.dtype(any_numpy_dtype) + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling datetime with anything but datetime casts to object + if fill_dtype.kind == "M": + expected_dtype = dtype + # for datetime dtypes, scalar values get cast to to_datetime64 + exp_val_for_scalar = pd.Timestamp(fill_value).to_datetime64() + else: + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +@pytest.mark.parametrize( + "fill_value", + [ + pd.Timestamp("now"), + np.datetime64("now"), + datetime.datetime.now(), + datetime.date.today(), + ], + ids=["pd.Timestamp", "np.datetime64", "datetime.datetime", "datetime.date"], +) +def test_maybe_promote_any_with_datetime64(any_numpy_dtype, fill_value): + dtype = np.dtype(any_numpy_dtype) + + # filling datetime with anything but datetime casts to object + if dtype.kind == "M": + expected_dtype = dtype + # for datetime dtypes, scalar values get cast to pd.Timestamp.value + exp_val_for_scalar = pd.Timestamp(fill_value).to_datetime64() + else: + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + + if type(fill_value) is datetime.date and dtype.kind == "M": + # Casting date to dt64 is deprecated, in 2.0 enforced to cast to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +@pytest.mark.parametrize( + "fill_value", + [ + pd.Timestamp(2023, 1, 1), + np.datetime64("2023-01-01"), + datetime.datetime(2023, 1, 1), + datetime.date(2023, 1, 1), + ], + ids=["pd.Timestamp", "np.datetime64", "datetime.datetime", "datetime.date"], +) +def test_maybe_promote_any_numpy_dtype_with_datetimetz( + any_numpy_dtype, tz_aware_fixture, fill_value +): + dtype = np.dtype(any_numpy_dtype) + fill_dtype = DatetimeTZDtype(tz=tz_aware_fixture) + + fill_value = pd.Series([fill_value], dtype=fill_dtype)[0] + + # filling any numpy dtype with datetimetz casts to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_timedelta64_with_any(timedelta64_dtype, any_numpy_dtype): + dtype = np.dtype(timedelta64_dtype) + fill_dtype = np.dtype(any_numpy_dtype) + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling timedelta with anything but timedelta casts to object + if fill_dtype.kind == "m": + expected_dtype = dtype + # for timedelta dtypes, scalar values get cast to pd.Timedelta.value + exp_val_for_scalar = pd.Timedelta(fill_value).to_timedelta64() + else: + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +@pytest.mark.parametrize( + "fill_value", + [pd.Timedelta(days=1), np.timedelta64(24, "h"), datetime.timedelta(1)], + ids=["pd.Timedelta", "np.timedelta64", "datetime.timedelta"], +) +def test_maybe_promote_any_with_timedelta64(any_numpy_dtype, fill_value): + dtype = np.dtype(any_numpy_dtype) + + # filling anything but timedelta with timedelta casts to object + if dtype.kind == "m": + expected_dtype = dtype + # for timedelta dtypes, scalar values get cast to pd.Timedelta.value + exp_val_for_scalar = pd.Timedelta(fill_value).to_timedelta64() + else: + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_string_with_any(string_dtype, any_numpy_dtype): + dtype = np.dtype(string_dtype) + fill_dtype = np.dtype(any_numpy_dtype) + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling string with anything casts to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_any_with_string(any_numpy_dtype): + dtype = np.dtype(any_numpy_dtype) + + # create array of given dtype + fill_value = "abc" + + # filling anything with a string casts to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_object_with_any(object_dtype, any_numpy_dtype): + dtype = np.dtype(object_dtype) + fill_dtype = np.dtype(any_numpy_dtype) + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling object with anything stays object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_any_with_object(any_numpy_dtype): + dtype = np.dtype(any_numpy_dtype) + + # create array of object dtype from a scalar value (i.e. passing + # dtypes.common.is_scalar), which can however not be cast to int/float etc. + fill_value = pd.DateOffset(1) + + # filling object with anything stays object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_any_numpy_dtype_with_na(any_numpy_dtype, nulls_fixture): + fill_value = nulls_fixture + dtype = np.dtype(any_numpy_dtype) + + if isinstance(fill_value, Decimal): + # Subject to change, but ATM (When Decimal(NAN) is being added to nulls_fixture) + # this is the existing behavior in maybe_promote, + # hinges on is_valid_na_for_dtype + if dtype.kind in "iufc": + if dtype.kind in "iu": + expected_dtype = np.dtype(np.float64) + else: + expected_dtype = dtype + exp_val_for_scalar = np.nan + else: + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + elif dtype.kind in "iu" and fill_value is not NaT: + # integer + other missing value (np.nan / None) casts to float + expected_dtype = np.float64 + exp_val_for_scalar = np.nan + elif dtype == object and fill_value is NaT: + # inserting into object does not cast the value + # but *does* cast None to np.nan + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + elif dtype.kind in "mM": + # datetime / timedelta cast all missing values to dtyped-NaT + expected_dtype = dtype + exp_val_for_scalar = dtype.type("NaT", "ns") + elif fill_value is NaT: + # NaT upcasts everything that's not datetime/timedelta to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = NaT + elif dtype.kind in "fc": + # float / complex + missing value (!= NaT) stays the same + expected_dtype = dtype + exp_val_for_scalar = np.nan + else: + # all other cases cast to object, and use np.nan as missing value + expected_dtype = np.dtype(object) + if fill_value is pd.NA: + exp_val_for_scalar = pd.NA + else: + exp_val_for_scalar = np.nan + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/array_with_attr/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/array_with_attr/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0ae5db26e86123e9696bf83de05551c681fb9aba Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/array_with_attr/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/array_with_attr/__pycache__/array.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/array_with_attr/__pycache__/array.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b86dd603522aa3f30d6eca99867630a6759a4982 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/array_with_attr/__pycache__/array.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/array_with_attr/__pycache__/test_array_with_attr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/array_with_attr/__pycache__/test_array_with_attr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b8dd962340343cbe4d4ba341c37eb44b51b4450 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/array_with_attr/__pycache__/test_array_with_attr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24e48599391f82a9cf25a5020e0f424934437ea5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/accumulate.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/accumulate.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b58325881ec6f9f3e5150cb10bc9fe4e96aca4c5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/accumulate.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/base.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..329a3cc9f4b60f5cd4cb8014d421e6e4afa9701c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/base.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/casting.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/casting.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b061a5a9a25d0a97b3f89d5e969eb4f5adcd664 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/casting.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/constructors.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/constructors.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c332b47dfa52cc2e146ff6217ee602b11fe273ab Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/constructors.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/dim2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/dim2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..732d3ce9c1fca89ccedef9d1f0e9e23878f9ba91 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/dim2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/dtype.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/dtype.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b61578674ad0f5ba9c03c7501560f311ee2fc252 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/dtype.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/getitem.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/getitem.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b165a871926fe325ab414424bf588faada979230 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/getitem.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/groupby.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/groupby.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b17cdc212c09b120729c9304a4a7559bb87099b7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/groupby.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/index.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/index.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..61b8893e6689a58279fc4fca94ae008e8061b723 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/index.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/interface.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/interface.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c822970de6c984a741c07fc78455b10bbbc80d4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/interface.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/io.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/io.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..509edcaae9852a2799d932c2263e457977a7f76e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/io.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/methods.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/methods.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b766915b7c170922019309e62d236ff133ea5cd9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/methods.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/missing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/missing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22e5198cdc8f86127d9fe8bc9a952ab48cd36eb2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/missing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/ops.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/ops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a1c35337278180da04fa6a4b65b02c600eecacf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/ops.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/printing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/printing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..220b9612525432ce1d92ddcbaccb95a48f24bda1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/printing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/reduce.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/reduce.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b58020b166d3bfb1ddc9fd70deaa6e1d7c7cea7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/reduce.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/reshaping.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/reshaping.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4c3c50304e021ff39c1e7fcadb6a03a66764988 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/reshaping.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/setitem.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/setitem.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7373bb06fcd539f7b706571922ad8d405e9af976 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/base/__pycache__/setitem.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/date/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/date/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf72a5e526de1d40bdc5e17a048ba84419b7a04f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/date/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/date/__pycache__/array.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/date/__pycache__/array.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8c4eeaef70c141965b8f53dd6f74c32be82a6384 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/date/__pycache__/array.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/decimal/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/decimal/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..330c45fc4da180e626afc19b3b217bc6c30a1397 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/decimal/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/decimal/__pycache__/array.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/decimal/__pycache__/array.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..810f796097dbd860331de02040d91e1e1d016cdc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/decimal/__pycache__/array.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/decimal/__pycache__/test_decimal.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/decimal/__pycache__/test_decimal.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..72ec3567e597230ab1b137ae93c136fbfce376a0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/decimal/__pycache__/test_decimal.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/json/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/json/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4eaeed75d891e8e7de1813d390989d059a9f4f3b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/json/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/json/__pycache__/array.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/json/__pycache__/array.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d78218d7beaced6b970cab9b5b774a391b02f271 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/json/__pycache__/array.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/json/__pycache__/test_json.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/json/__pycache__/test_json.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93eb5591a3df317d1444c4d31ce16c5a9bed21e8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/json/__pycache__/test_json.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/list/__pycache__/array.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/list/__pycache__/array.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..59d67b153fc06b2f946e87ed2b063ddae1fc8b4b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/extension/list/__pycache__/array.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..19b317b02f23576149a1a8d15b5a93973f89c222 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/test_duplicate_labels.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/test_duplicate_labels.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d2418a0764b4a14fac873899ac23a3286ada24e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/test_duplicate_labels.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/test_finalize.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/test_finalize.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb9ef2a6a77fd24ebcc2d16208dd331d5f50daff Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/test_finalize.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/test_frame.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/test_frame.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a85ee868fdc7a78c06f1ec0688d27cfe28907afe Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/test_frame.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/test_generic.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/test_generic.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..be4a0f21740472ec0c2648042cf179a2c6384d1e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/test_generic.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/test_label_or_level_utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/test_label_or_level_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba9eddf0d297a141b482c82255a6f3851378d843 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/test_label_or_level_utils.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/test_series.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/test_series.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a0e43eda5af26c861ef6feac1ed10b6cace26c7c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/test_series.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/test_to_xarray.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/test_to_xarray.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..21ab3cdad1ac21843d3d5eb04b091ce4d09a4c80 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/__pycache__/test_to_xarray.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/test_duplicate_labels.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/test_duplicate_labels.py new file mode 100644 index 0000000000000000000000000000000000000000..f54db07824daf15eb01c32490495deff3736b14d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/test_duplicate_labels.py @@ -0,0 +1,413 @@ +"""Tests dealing with the NDFrame.allows_duplicates.""" +import operator + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + +not_implemented = pytest.mark.xfail(reason="Not implemented.") + +# ---------------------------------------------------------------------------- +# Preservation + + +class TestPreserves: + @pytest.mark.parametrize( + "cls, data", + [ + (pd.Series, np.array([])), + (pd.Series, [1, 2]), + (pd.DataFrame, {}), + (pd.DataFrame, {"A": [1, 2]}), + ], + ) + def test_construction_ok(self, cls, data): + result = cls(data) + assert result.flags.allows_duplicate_labels is True + + result = cls(data).set_flags(allows_duplicate_labels=False) + assert result.flags.allows_duplicate_labels is False + + @pytest.mark.parametrize( + "func", + [ + operator.itemgetter(["a"]), + operator.methodcaller("add", 1), + operator.methodcaller("rename", str.upper), + operator.methodcaller("rename", "name"), + operator.methodcaller("abs"), + np.abs, + ], + ) + def test_preserved_series(self, func): + s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False) + assert func(s).flags.allows_duplicate_labels is False + + @pytest.mark.parametrize( + "other", [pd.Series(0, index=["a", "b", "c"]), pd.Series(0, index=["a", "b"])] + ) + # TODO: frame + @not_implemented + def test_align(self, other): + s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False) + a, b = s.align(other) + assert a.flags.allows_duplicate_labels is False + assert b.flags.allows_duplicate_labels is False + + def test_preserved_frame(self): + df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ) + assert df.loc[["a"]].flags.allows_duplicate_labels is False + assert df.loc[:, ["A", "B"]].flags.allows_duplicate_labels is False + + def test_to_frame(self): + ser = pd.Series(dtype=float).set_flags(allows_duplicate_labels=False) + assert ser.to_frame().flags.allows_duplicate_labels is False + + @pytest.mark.parametrize("func", ["add", "sub"]) + @pytest.mark.parametrize("frame", [False, True]) + @pytest.mark.parametrize("other", [1, pd.Series([1, 2], name="A")]) + def test_binops(self, func, other, frame): + df = pd.Series([1, 2], name="A", index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ) + if frame: + df = df.to_frame() + if isinstance(other, pd.Series) and frame: + other = other.to_frame() + func = operator.methodcaller(func, other) + assert df.flags.allows_duplicate_labels is False + assert func(df).flags.allows_duplicate_labels is False + + def test_preserve_getitem(self): + df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False) + assert df[["A"]].flags.allows_duplicate_labels is False + assert df["A"].flags.allows_duplicate_labels is False + assert df.loc[0].flags.allows_duplicate_labels is False + assert df.loc[[0]].flags.allows_duplicate_labels is False + assert df.loc[0, ["A"]].flags.allows_duplicate_labels is False + + def test_ndframe_getitem_caching_issue( + self, request, using_copy_on_write, warn_copy_on_write + ): + if not (using_copy_on_write or warn_copy_on_write): + request.applymarker(pytest.mark.xfail(reason="Unclear behavior.")) + # NDFrame.__getitem__ will cache the first df['A']. May need to + # invalidate that cache? Update the cached entries? + df = pd.DataFrame({"A": [0]}).set_flags(allows_duplicate_labels=False) + assert df["A"].flags.allows_duplicate_labels is False + df.flags.allows_duplicate_labels = True + assert df["A"].flags.allows_duplicate_labels is True + + @pytest.mark.parametrize( + "objs, kwargs", + [ + # Series + ( + [ + pd.Series(1, index=["a", "b"]), + pd.Series(2, index=["c", "d"]), + ], + {}, + ), + ( + [ + pd.Series(1, index=["a", "b"]), + pd.Series(2, index=["a", "b"]), + ], + {"ignore_index": True}, + ), + ( + [ + pd.Series(1, index=["a", "b"]), + pd.Series(2, index=["a", "b"]), + ], + {"axis": 1}, + ), + # Frame + ( + [ + pd.DataFrame({"A": [1, 2]}, index=["a", "b"]), + pd.DataFrame({"A": [1, 2]}, index=["c", "d"]), + ], + {}, + ), + ( + [ + pd.DataFrame({"A": [1, 2]}, index=["a", "b"]), + pd.DataFrame({"A": [1, 2]}, index=["a", "b"]), + ], + {"ignore_index": True}, + ), + ( + [ + pd.DataFrame({"A": [1, 2]}, index=["a", "b"]), + pd.DataFrame({"B": [1, 2]}, index=["a", "b"]), + ], + {"axis": 1}, + ), + # Series / Frame + ( + [ + pd.DataFrame({"A": [1, 2]}, index=["a", "b"]), + pd.Series([1, 2], index=["a", "b"], name="B"), + ], + {"axis": 1}, + ), + ], + ) + def test_concat(self, objs, kwargs): + objs = [x.set_flags(allows_duplicate_labels=False) for x in objs] + result = pd.concat(objs, **kwargs) + assert result.flags.allows_duplicate_labels is False + + @pytest.mark.parametrize( + "left, right, expected", + [ + # false false false + pytest.param( + pd.DataFrame({"A": [0, 1]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + pd.DataFrame({"B": [0, 1]}, index=["a", "d"]).set_flags( + allows_duplicate_labels=False + ), + False, + marks=not_implemented, + ), + # false true false + pytest.param( + pd.DataFrame({"A": [0, 1]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + pd.DataFrame({"B": [0, 1]}, index=["a", "d"]), + False, + marks=not_implemented, + ), + # true true true + ( + pd.DataFrame({"A": [0, 1]}, index=["a", "b"]), + pd.DataFrame({"B": [0, 1]}, index=["a", "d"]), + True, + ), + ], + ) + def test_merge(self, left, right, expected): + result = pd.merge(left, right, left_index=True, right_index=True) + assert result.flags.allows_duplicate_labels is expected + + @not_implemented + def test_groupby(self): + # XXX: This is under tested + # TODO: + # - apply + # - transform + # - Should passing a grouper that disallows duplicates propagate? + df = pd.DataFrame({"A": [1, 2, 3]}).set_flags(allows_duplicate_labels=False) + result = df.groupby([0, 0, 1]).agg("count") + assert result.flags.allows_duplicate_labels is False + + @pytest.mark.parametrize("frame", [True, False]) + @not_implemented + def test_window(self, frame): + df = pd.Series( + 1, + index=pd.date_range("2000", periods=12), + name="A", + allows_duplicate_labels=False, + ) + if frame: + df = df.to_frame() + assert df.rolling(3).mean().flags.allows_duplicate_labels is False + assert df.ewm(3).mean().flags.allows_duplicate_labels is False + assert df.expanding(3).mean().flags.allows_duplicate_labels is False + + +# ---------------------------------------------------------------------------- +# Raises + + +class TestRaises: + @pytest.mark.parametrize( + "cls, axes", + [ + (pd.Series, {"index": ["a", "a"], "dtype": float}), + (pd.DataFrame, {"index": ["a", "a"]}), + (pd.DataFrame, {"index": ["a", "a"], "columns": ["b", "b"]}), + (pd.DataFrame, {"columns": ["b", "b"]}), + ], + ) + def test_set_flags_with_duplicates(self, cls, axes): + result = cls(**axes) + assert result.flags.allows_duplicate_labels is True + + msg = "Index has duplicates." + with pytest.raises(pd.errors.DuplicateLabelError, match=msg): + cls(**axes).set_flags(allows_duplicate_labels=False) + + @pytest.mark.parametrize( + "data", + [ + pd.Series(index=[0, 0], dtype=float), + pd.DataFrame(index=[0, 0]), + pd.DataFrame(columns=[0, 0]), + ], + ) + def test_setting_allows_duplicate_labels_raises(self, data): + msg = "Index has duplicates." + with pytest.raises(pd.errors.DuplicateLabelError, match=msg): + data.flags.allows_duplicate_labels = False + + assert data.flags.allows_duplicate_labels is True + + def test_series_raises(self): + a = pd.Series(0, index=["a", "b"]) + b = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False) + msg = "Index has duplicates." + with pytest.raises(pd.errors.DuplicateLabelError, match=msg): + pd.concat([a, b]) + + @pytest.mark.parametrize( + "getter, target", + [ + (operator.itemgetter(["A", "A"]), None), + # loc + (operator.itemgetter(["a", "a"]), "loc"), + pytest.param(operator.itemgetter(("a", ["A", "A"])), "loc"), + (operator.itemgetter((["a", "a"], "A")), "loc"), + # iloc + (operator.itemgetter([0, 0]), "iloc"), + pytest.param(operator.itemgetter((0, [0, 0])), "iloc"), + pytest.param(operator.itemgetter(([0, 0], 0)), "iloc"), + ], + ) + def test_getitem_raises(self, getter, target): + df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ) + if target: + # df, df.loc, or df.iloc + target = getattr(df, target) + else: + target = df + + msg = "Index has duplicates." + with pytest.raises(pd.errors.DuplicateLabelError, match=msg): + getter(target) + + @pytest.mark.parametrize( + "objs, kwargs", + [ + ( + [ + pd.Series(1, index=[0, 1], name="a"), + pd.Series(2, index=[0, 1], name="a"), + ], + {"axis": 1}, + ) + ], + ) + def test_concat_raises(self, objs, kwargs): + objs = [x.set_flags(allows_duplicate_labels=False) for x in objs] + msg = "Index has duplicates." + with pytest.raises(pd.errors.DuplicateLabelError, match=msg): + pd.concat(objs, **kwargs) + + @not_implemented + def test_merge_raises(self): + a = pd.DataFrame({"A": [0, 1, 2]}, index=["a", "b", "c"]).set_flags( + allows_duplicate_labels=False + ) + b = pd.DataFrame({"B": [0, 1, 2]}, index=["a", "b", "b"]) + msg = "Index has duplicates." + with pytest.raises(pd.errors.DuplicateLabelError, match=msg): + pd.merge(a, b, left_index=True, right_index=True) + + +@pytest.mark.parametrize( + "idx", + [ + pd.Index([1, 1]), + pd.Index(["a", "a"]), + pd.Index([1.1, 1.1]), + pd.PeriodIndex([pd.Period("2000", "D")] * 2), + pd.DatetimeIndex([pd.Timestamp("2000")] * 2), + pd.TimedeltaIndex([pd.Timedelta("1D")] * 2), + pd.CategoricalIndex(["a", "a"]), + pd.IntervalIndex([pd.Interval(0, 1)] * 2), + pd.MultiIndex.from_tuples([("a", 1), ("a", 1)]), + ], + ids=lambda x: type(x).__name__, +) +def test_raises_basic(idx): + msg = "Index has duplicates." + with pytest.raises(pd.errors.DuplicateLabelError, match=msg): + pd.Series(1, index=idx).set_flags(allows_duplicate_labels=False) + + with pytest.raises(pd.errors.DuplicateLabelError, match=msg): + pd.DataFrame({"A": [1, 1]}, index=idx).set_flags(allows_duplicate_labels=False) + + with pytest.raises(pd.errors.DuplicateLabelError, match=msg): + pd.DataFrame([[1, 2]], columns=idx).set_flags(allows_duplicate_labels=False) + + +def test_format_duplicate_labels_message(): + idx = pd.Index(["a", "b", "a", "b", "c"]) + result = idx._format_duplicate_message() + expected = pd.DataFrame( + {"positions": [[0, 2], [1, 3]]}, index=pd.Index(["a", "b"], name="label") + ) + tm.assert_frame_equal(result, expected) + + +def test_format_duplicate_labels_message_multi(): + idx = pd.MultiIndex.from_product([["A"], ["a", "b", "a", "b", "c"]]) + result = idx._format_duplicate_message() + expected = pd.DataFrame( + {"positions": [[0, 2], [1, 3]]}, + index=pd.MultiIndex.from_product([["A"], ["a", "b"]]), + ) + tm.assert_frame_equal(result, expected) + + +def test_dataframe_insert_raises(): + df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False) + msg = "Cannot specify" + with pytest.raises(ValueError, match=msg): + df.insert(0, "A", [3, 4], allow_duplicates=True) + + +@pytest.mark.parametrize( + "method, frame_only", + [ + (operator.methodcaller("set_index", "A", inplace=True), True), + (operator.methodcaller("reset_index", inplace=True), True), + (operator.methodcaller("rename", lambda x: x, inplace=True), False), + ], +) +def test_inplace_raises(method, frame_only): + df = pd.DataFrame({"A": [0, 0], "B": [1, 2]}).set_flags( + allows_duplicate_labels=False + ) + s = df["A"] + s.flags.allows_duplicate_labels = False + msg = "Cannot specify" + + with pytest.raises(ValueError, match=msg): + method(df) + if not frame_only: + with pytest.raises(ValueError, match=msg): + method(s) + + +def test_pickle(): + a = pd.Series([1, 2]).set_flags(allows_duplicate_labels=False) + b = tm.round_trip_pickle(a) + tm.assert_series_equal(a, b) + + a = pd.DataFrame({"A": []}).set_flags(allows_duplicate_labels=False) + b = tm.round_trip_pickle(a) + tm.assert_frame_equal(a, b) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/test_finalize.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/test_finalize.py new file mode 100644 index 0000000000000000000000000000000000000000..866e9e203ffe3ac1fe29d86b87bbacccf1268e12 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/test_finalize.py @@ -0,0 +1,767 @@ +""" +An exhaustive list of pandas methods exercising NDFrame.__finalize__. +""" +import operator +import re + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + +# TODO: +# * Binary methods (mul, div, etc.) +# * Binary outputs (align, etc.) +# * top-level methods (concat, merge, get_dummies, etc.) +# * window +# * cumulative reductions + +not_implemented_mark = pytest.mark.xfail(reason="not implemented") + +mi = pd.MultiIndex.from_product([["a", "b"], [0, 1]], names=["A", "B"]) + +frame_data = ({"A": [1]},) +frame_mi_data = ({"A": [1, 2, 3, 4]}, mi) + + +# Tuple of +# - Callable: Constructor (Series, DataFrame) +# - Tuple: Constructor args +# - Callable: pass the constructed value with attrs set to this. + +_all_methods = [ + (pd.Series, ([0],), operator.methodcaller("take", [])), + (pd.Series, ([0],), operator.methodcaller("__getitem__", [True])), + (pd.Series, ([0],), operator.methodcaller("repeat", 2)), + (pd.Series, ([0],), operator.methodcaller("reset_index")), + (pd.Series, ([0],), operator.methodcaller("reset_index", drop=True)), + (pd.Series, ([0],), operator.methodcaller("to_frame")), + (pd.Series, ([0, 0],), operator.methodcaller("drop_duplicates")), + (pd.Series, ([0, 0],), operator.methodcaller("duplicated")), + (pd.Series, ([0, 0],), operator.methodcaller("round")), + (pd.Series, ([0, 0],), operator.methodcaller("rename", lambda x: x + 1)), + (pd.Series, ([0, 0],), operator.methodcaller("rename", "name")), + (pd.Series, ([0, 0],), operator.methodcaller("set_axis", ["a", "b"])), + (pd.Series, ([0, 0],), operator.methodcaller("reindex", [1, 0])), + (pd.Series, ([0, 0],), operator.methodcaller("drop", [0])), + (pd.Series, (pd.array([0, pd.NA]),), operator.methodcaller("fillna", 0)), + (pd.Series, ([0, 0],), operator.methodcaller("replace", {0: 1})), + (pd.Series, ([0, 0],), operator.methodcaller("shift")), + (pd.Series, ([0, 0],), operator.methodcaller("isin", [0, 1])), + (pd.Series, ([0, 0],), operator.methodcaller("between", 0, 2)), + (pd.Series, ([0, 0],), operator.methodcaller("isna")), + (pd.Series, ([0, 0],), operator.methodcaller("isnull")), + (pd.Series, ([0, 0],), operator.methodcaller("notna")), + (pd.Series, ([0, 0],), operator.methodcaller("notnull")), + (pd.Series, ([1],), operator.methodcaller("add", pd.Series([1]))), + # TODO: mul, div, etc. + ( + pd.Series, + ([0], pd.period_range("2000", periods=1)), + operator.methodcaller("to_timestamp"), + ), + ( + pd.Series, + ([0], pd.date_range("2000", periods=1)), + operator.methodcaller("to_period"), + ), + pytest.param( + ( + pd.DataFrame, + frame_data, + operator.methodcaller("dot", pd.DataFrame(index=["A"])), + ), + marks=pytest.mark.xfail(reason="Implement binary finalize"), + ), + (pd.DataFrame, frame_data, operator.methodcaller("transpose")), + (pd.DataFrame, frame_data, operator.methodcaller("__getitem__", "A")), + (pd.DataFrame, frame_data, operator.methodcaller("__getitem__", ["A"])), + (pd.DataFrame, frame_data, operator.methodcaller("__getitem__", np.array([True]))), + (pd.DataFrame, ({("A", "a"): [1]},), operator.methodcaller("__getitem__", ["A"])), + (pd.DataFrame, frame_data, operator.methodcaller("query", "A == 1")), + (pd.DataFrame, frame_data, operator.methodcaller("eval", "A + 1", engine="python")), + (pd.DataFrame, frame_data, operator.methodcaller("select_dtypes", include="int")), + (pd.DataFrame, frame_data, operator.methodcaller("assign", b=1)), + (pd.DataFrame, frame_data, operator.methodcaller("set_axis", ["A"])), + (pd.DataFrame, frame_data, operator.methodcaller("reindex", [0, 1])), + (pd.DataFrame, frame_data, operator.methodcaller("drop", columns=["A"])), + (pd.DataFrame, frame_data, operator.methodcaller("drop", index=[0])), + (pd.DataFrame, frame_data, operator.methodcaller("rename", columns={"A": "a"})), + (pd.DataFrame, frame_data, operator.methodcaller("rename", index=lambda x: x)), + (pd.DataFrame, frame_data, operator.methodcaller("fillna", "A")), + (pd.DataFrame, frame_data, operator.methodcaller("fillna", method="ffill")), + (pd.DataFrame, frame_data, operator.methodcaller("set_index", "A")), + (pd.DataFrame, frame_data, operator.methodcaller("reset_index")), + (pd.DataFrame, frame_data, operator.methodcaller("isna")), + (pd.DataFrame, frame_data, operator.methodcaller("isnull")), + (pd.DataFrame, frame_data, operator.methodcaller("notna")), + (pd.DataFrame, frame_data, operator.methodcaller("notnull")), + (pd.DataFrame, frame_data, operator.methodcaller("dropna")), + (pd.DataFrame, frame_data, operator.methodcaller("drop_duplicates")), + (pd.DataFrame, frame_data, operator.methodcaller("duplicated")), + (pd.DataFrame, frame_data, operator.methodcaller("sort_values", by="A")), + (pd.DataFrame, frame_data, operator.methodcaller("sort_index")), + (pd.DataFrame, frame_data, operator.methodcaller("nlargest", 1, "A")), + (pd.DataFrame, frame_data, operator.methodcaller("nsmallest", 1, "A")), + (pd.DataFrame, frame_mi_data, operator.methodcaller("swaplevel")), + ( + pd.DataFrame, + frame_data, + operator.methodcaller("add", pd.DataFrame(*frame_data)), + ), + # TODO: div, mul, etc. + ( + pd.DataFrame, + frame_data, + operator.methodcaller("combine", pd.DataFrame(*frame_data), operator.add), + ), + ( + pd.DataFrame, + frame_data, + operator.methodcaller("combine_first", pd.DataFrame(*frame_data)), + ), + pytest.param( + ( + pd.DataFrame, + frame_data, + operator.methodcaller("update", pd.DataFrame(*frame_data)), + ), + marks=not_implemented_mark, + ), + (pd.DataFrame, frame_data, operator.methodcaller("pivot", columns="A")), + ( + pd.DataFrame, + ({"A": [1], "B": [1]},), + operator.methodcaller("pivot_table", columns="A"), + ), + ( + pd.DataFrame, + ({"A": [1], "B": [1]},), + operator.methodcaller("pivot_table", columns="A", aggfunc=["mean", "sum"]), + ), + (pd.DataFrame, frame_data, operator.methodcaller("stack")), + (pd.DataFrame, frame_data, operator.methodcaller("explode", "A")), + (pd.DataFrame, frame_mi_data, operator.methodcaller("unstack")), + ( + pd.DataFrame, + ({"A": ["a", "b", "c"], "B": [1, 3, 5], "C": [2, 4, 6]},), + operator.methodcaller("melt", id_vars=["A"], value_vars=["B"]), + ), + (pd.DataFrame, frame_data, operator.methodcaller("map", lambda x: x)), + pytest.param( + ( + pd.DataFrame, + frame_data, + operator.methodcaller("merge", pd.DataFrame({"A": [1]})), + ), + marks=not_implemented_mark, + ), + (pd.DataFrame, frame_data, operator.methodcaller("round", 2)), + (pd.DataFrame, frame_data, operator.methodcaller("corr")), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("cov")), + marks=[ + pytest.mark.filterwarnings("ignore::RuntimeWarning"), + ], + ), + ( + pd.DataFrame, + frame_data, + operator.methodcaller("corrwith", pd.DataFrame(*frame_data)), + ), + (pd.DataFrame, frame_data, operator.methodcaller("count")), + (pd.DataFrame, frame_data, operator.methodcaller("nunique")), + (pd.DataFrame, frame_data, operator.methodcaller("idxmin")), + (pd.DataFrame, frame_data, operator.methodcaller("idxmax")), + (pd.DataFrame, frame_data, operator.methodcaller("mode")), + (pd.Series, [0], operator.methodcaller("mode")), + (pd.DataFrame, frame_data, operator.methodcaller("median")), + ( + pd.DataFrame, + frame_data, + operator.methodcaller("quantile", numeric_only=True), + ), + ( + pd.DataFrame, + frame_data, + operator.methodcaller("quantile", q=[0.25, 0.75], numeric_only=True), + ), + ( + pd.DataFrame, + ({"A": [pd.Timedelta(days=1), pd.Timedelta(days=2)]},), + operator.methodcaller("quantile", numeric_only=False), + ), + ( + pd.DataFrame, + ({"A": [np.datetime64("2022-01-01"), np.datetime64("2022-01-02")]},), + operator.methodcaller("quantile", numeric_only=True), + ), + ( + pd.DataFrame, + ({"A": [1]}, [pd.Period("2000", "D")]), + operator.methodcaller("to_timestamp"), + ), + ( + pd.DataFrame, + ({"A": [1]}, [pd.Timestamp("2000")]), + operator.methodcaller("to_period", freq="D"), + ), + (pd.DataFrame, frame_mi_data, operator.methodcaller("isin", [1])), + (pd.DataFrame, frame_mi_data, operator.methodcaller("isin", pd.Series([1]))), + ( + pd.DataFrame, + frame_mi_data, + operator.methodcaller("isin", pd.DataFrame({"A": [1]})), + ), + (pd.DataFrame, frame_mi_data, operator.methodcaller("droplevel", "A")), + (pd.DataFrame, frame_data, operator.methodcaller("pop", "A")), + # Squeeze on columns, otherwise we'll end up with a scalar + (pd.DataFrame, frame_data, operator.methodcaller("squeeze", axis="columns")), + (pd.Series, ([1, 2],), operator.methodcaller("squeeze")), + (pd.Series, ([1, 2],), operator.methodcaller("rename_axis", index="a")), + (pd.DataFrame, frame_data, operator.methodcaller("rename_axis", columns="a")), + # Unary ops + (pd.DataFrame, frame_data, operator.neg), + (pd.Series, [1], operator.neg), + (pd.DataFrame, frame_data, operator.pos), + (pd.Series, [1], operator.pos), + (pd.DataFrame, frame_data, operator.inv), + (pd.Series, [1], operator.inv), + (pd.DataFrame, frame_data, abs), + (pd.Series, [1], abs), + (pd.DataFrame, frame_data, round), + (pd.Series, [1], round), + (pd.DataFrame, frame_data, operator.methodcaller("take", [0, 0])), + (pd.DataFrame, frame_mi_data, operator.methodcaller("xs", "a")), + (pd.Series, (1, mi), operator.methodcaller("xs", "a")), + (pd.DataFrame, frame_data, operator.methodcaller("get", "A")), + ( + pd.DataFrame, + frame_data, + operator.methodcaller("reindex_like", pd.DataFrame({"A": [1, 2, 3]})), + ), + ( + pd.Series, + frame_data, + operator.methodcaller("reindex_like", pd.Series([0, 1, 2])), + ), + (pd.DataFrame, frame_data, operator.methodcaller("add_prefix", "_")), + (pd.DataFrame, frame_data, operator.methodcaller("add_suffix", "_")), + (pd.Series, (1, ["a", "b"]), operator.methodcaller("add_prefix", "_")), + (pd.Series, (1, ["a", "b"]), operator.methodcaller("add_suffix", "_")), + (pd.Series, ([3, 2],), operator.methodcaller("sort_values")), + (pd.Series, ([1] * 10,), operator.methodcaller("head")), + (pd.DataFrame, ({"A": [1] * 10},), operator.methodcaller("head")), + (pd.Series, ([1] * 10,), operator.methodcaller("tail")), + (pd.DataFrame, ({"A": [1] * 10},), operator.methodcaller("tail")), + (pd.Series, ([1, 2],), operator.methodcaller("sample", n=2, replace=True)), + (pd.DataFrame, (frame_data,), operator.methodcaller("sample", n=2, replace=True)), + (pd.Series, ([1, 2],), operator.methodcaller("astype", float)), + (pd.DataFrame, frame_data, operator.methodcaller("astype", float)), + (pd.Series, ([1, 2],), operator.methodcaller("copy")), + (pd.DataFrame, frame_data, operator.methodcaller("copy")), + (pd.Series, ([1, 2], None, object), operator.methodcaller("infer_objects")), + ( + pd.DataFrame, + ({"A": np.array([1, 2], dtype=object)},), + operator.methodcaller("infer_objects"), + ), + (pd.Series, ([1, 2],), operator.methodcaller("convert_dtypes")), + (pd.DataFrame, frame_data, operator.methodcaller("convert_dtypes")), + (pd.Series, ([1, None, 3],), operator.methodcaller("interpolate")), + (pd.DataFrame, ({"A": [1, None, 3]},), operator.methodcaller("interpolate")), + (pd.Series, ([1, 2],), operator.methodcaller("clip", lower=1)), + (pd.DataFrame, frame_data, operator.methodcaller("clip", lower=1)), + ( + pd.Series, + (1, pd.date_range("2000", periods=4)), + operator.methodcaller("asfreq", "h"), + ), + ( + pd.DataFrame, + ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), + operator.methodcaller("asfreq", "h"), + ), + ( + pd.Series, + (1, pd.date_range("2000", periods=4)), + operator.methodcaller("at_time", "12:00"), + ), + ( + pd.DataFrame, + ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), + operator.methodcaller("at_time", "12:00"), + ), + ( + pd.Series, + (1, pd.date_range("2000", periods=4)), + operator.methodcaller("between_time", "12:00", "13:00"), + ), + ( + pd.DataFrame, + ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), + operator.methodcaller("between_time", "12:00", "13:00"), + ), + ( + pd.Series, + (1, pd.date_range("2000", periods=4)), + operator.methodcaller("last", "3D"), + ), + ( + pd.DataFrame, + ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), + operator.methodcaller("last", "3D"), + ), + (pd.Series, ([1, 2],), operator.methodcaller("rank")), + (pd.DataFrame, frame_data, operator.methodcaller("rank")), + (pd.Series, ([1, 2],), operator.methodcaller("where", np.array([True, False]))), + (pd.DataFrame, frame_data, operator.methodcaller("where", np.array([[True]]))), + (pd.Series, ([1, 2],), operator.methodcaller("mask", np.array([True, False]))), + (pd.DataFrame, frame_data, operator.methodcaller("mask", np.array([[True]]))), + (pd.Series, ([1, 2],), operator.methodcaller("truncate", before=0)), + (pd.DataFrame, frame_data, operator.methodcaller("truncate", before=0)), + ( + pd.Series, + (1, pd.date_range("2000", periods=4, tz="UTC")), + operator.methodcaller("tz_convert", "CET"), + ), + ( + pd.DataFrame, + ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4, tz="UTC")), + operator.methodcaller("tz_convert", "CET"), + ), + ( + pd.Series, + (1, pd.date_range("2000", periods=4)), + operator.methodcaller("tz_localize", "CET"), + ), + ( + pd.DataFrame, + ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), + operator.methodcaller("tz_localize", "CET"), + ), + (pd.Series, ([1, 2],), operator.methodcaller("describe")), + (pd.DataFrame, frame_data, operator.methodcaller("describe")), + (pd.Series, ([1, 2],), operator.methodcaller("pct_change")), + (pd.DataFrame, frame_data, operator.methodcaller("pct_change")), + (pd.Series, ([1],), operator.methodcaller("transform", lambda x: x - x.min())), + ( + pd.DataFrame, + frame_mi_data, + operator.methodcaller("transform", lambda x: x - x.min()), + ), + (pd.Series, ([1],), operator.methodcaller("apply", lambda x: x)), + (pd.DataFrame, frame_mi_data, operator.methodcaller("apply", lambda x: x)), + # Cumulative reductions + (pd.Series, ([1],), operator.methodcaller("cumsum")), + (pd.DataFrame, frame_data, operator.methodcaller("cumsum")), + (pd.Series, ([1],), operator.methodcaller("cummin")), + (pd.DataFrame, frame_data, operator.methodcaller("cummin")), + (pd.Series, ([1],), operator.methodcaller("cummax")), + (pd.DataFrame, frame_data, operator.methodcaller("cummax")), + (pd.Series, ([1],), operator.methodcaller("cumprod")), + (pd.DataFrame, frame_data, operator.methodcaller("cumprod")), + # Reductions + (pd.DataFrame, frame_data, operator.methodcaller("any")), + (pd.DataFrame, frame_data, operator.methodcaller("all")), + (pd.DataFrame, frame_data, operator.methodcaller("min")), + (pd.DataFrame, frame_data, operator.methodcaller("max")), + (pd.DataFrame, frame_data, operator.methodcaller("sum")), + (pd.DataFrame, frame_data, operator.methodcaller("std")), + (pd.DataFrame, frame_data, operator.methodcaller("mean")), + (pd.DataFrame, frame_data, operator.methodcaller("prod")), + (pd.DataFrame, frame_data, operator.methodcaller("sem")), + (pd.DataFrame, frame_data, operator.methodcaller("skew")), + (pd.DataFrame, frame_data, operator.methodcaller("kurt")), +] + + +def idfn(x): + xpr = re.compile(r"'(.*)?'") + m = xpr.search(str(x)) + if m: + return m.group(1) + else: + return str(x) + + +@pytest.fixture(params=_all_methods, ids=lambda x: idfn(x[-1])) +def ndframe_method(request): + """ + An NDFrame method returning an NDFrame. + """ + return request.param + + +@pytest.mark.filterwarnings( + "ignore:DataFrame.fillna with 'method' is deprecated:FutureWarning", + "ignore:last is deprecated:FutureWarning", +) +def test_finalize_called(ndframe_method): + cls, init_args, method = ndframe_method + ndframe = cls(*init_args) + + ndframe.attrs = {"a": 1} + result = method(ndframe) + + assert result.attrs == {"a": 1} + + +@pytest.mark.parametrize( + "data", + [ + pd.Series(1, pd.date_range("2000", periods=4)), + pd.DataFrame({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), + ], +) +def test_finalize_first(data): + deprecated_msg = "first is deprecated" + + data.attrs = {"a": 1} + with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): + result = data.first("3D") + assert result.attrs == {"a": 1} + + +@pytest.mark.parametrize( + "data", + [ + pd.Series(1, pd.date_range("2000", periods=4)), + pd.DataFrame({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), + ], +) +def test_finalize_last(data): + # GH 53710 + deprecated_msg = "last is deprecated" + + data.attrs = {"a": 1} + with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): + result = data.last("3D") + assert result.attrs == {"a": 1} + + +@not_implemented_mark +def test_finalize_called_eval_numexpr(): + pytest.importorskip("numexpr") + df = pd.DataFrame({"A": [1, 2]}) + df.attrs["A"] = 1 + result = df.eval("A + 1", engine="numexpr") + assert result.attrs == {"A": 1} + + +# ---------------------------------------------------------------------------- +# Binary operations + + +@pytest.mark.parametrize("annotate", ["left", "right", "both"]) +@pytest.mark.parametrize( + "args", + [ + (1, pd.Series([1])), + (1, pd.DataFrame({"A": [1]})), + (pd.Series([1]), 1), + (pd.DataFrame({"A": [1]}), 1), + (pd.Series([1]), pd.Series([1])), + (pd.DataFrame({"A": [1]}), pd.DataFrame({"A": [1]})), + (pd.Series([1]), pd.DataFrame({"A": [1]})), + (pd.DataFrame({"A": [1]}), pd.Series([1])), + ], + ids=lambda x: f"({type(x[0]).__name__},{type(x[1]).__name__})", +) +def test_binops(request, args, annotate, all_binary_operators): + # This generates 624 tests... Is that needed? + left, right = args + if isinstance(left, (pd.DataFrame, pd.Series)): + left.attrs = {} + if isinstance(right, (pd.DataFrame, pd.Series)): + right.attrs = {} + + if annotate == "left" and isinstance(left, int): + pytest.skip("left is an int and doesn't support .attrs") + if annotate == "right" and isinstance(right, int): + pytest.skip("right is an int and doesn't support .attrs") + + if not (isinstance(left, int) or isinstance(right, int)) and annotate != "both": + if not all_binary_operators.__name__.startswith("r"): + if annotate == "right" and isinstance(left, type(right)): + request.applymarker( + pytest.mark.xfail( + reason=f"{all_binary_operators} doesn't work when right has " + f"attrs and both are {type(left)}" + ) + ) + if not isinstance(left, type(right)): + if annotate == "left" and isinstance(left, pd.Series): + request.applymarker( + pytest.mark.xfail( + reason=f"{all_binary_operators} doesn't work when the " + "objects are different Series has attrs" + ) + ) + elif annotate == "right" and isinstance(right, pd.Series): + request.applymarker( + pytest.mark.xfail( + reason=f"{all_binary_operators} doesn't work when the " + "objects are different Series has attrs" + ) + ) + else: + if annotate == "left" and isinstance(left, type(right)): + request.applymarker( + pytest.mark.xfail( + reason=f"{all_binary_operators} doesn't work when left has " + f"attrs and both are {type(left)}" + ) + ) + if not isinstance(left, type(right)): + if annotate == "right" and isinstance(right, pd.Series): + request.applymarker( + pytest.mark.xfail( + reason=f"{all_binary_operators} doesn't work when the " + "objects are different Series has attrs" + ) + ) + elif annotate == "left" and isinstance(left, pd.Series): + request.applymarker( + pytest.mark.xfail( + reason=f"{all_binary_operators} doesn't work when the " + "objects are different Series has attrs" + ) + ) + if annotate in {"left", "both"} and not isinstance(left, int): + left.attrs = {"a": 1} + if annotate in {"right", "both"} and not isinstance(right, int): + right.attrs = {"a": 1} + + is_cmp = all_binary_operators in [ + operator.eq, + operator.ne, + operator.gt, + operator.ge, + operator.lt, + operator.le, + ] + if is_cmp and isinstance(left, pd.DataFrame) and isinstance(right, pd.Series): + # in 2.0 silent alignment on comparisons was removed xref GH#28759 + left, right = left.align(right, axis=1, copy=False) + elif is_cmp and isinstance(left, pd.Series) and isinstance(right, pd.DataFrame): + right, left = right.align(left, axis=1, copy=False) + + result = all_binary_operators(left, right) + assert result.attrs == {"a": 1} + + +# ---------------------------------------------------------------------------- +# Accessors + + +@pytest.mark.parametrize( + "method", + [ + operator.methodcaller("capitalize"), + operator.methodcaller("casefold"), + operator.methodcaller("cat", ["a"]), + operator.methodcaller("contains", "a"), + operator.methodcaller("count", "a"), + operator.methodcaller("encode", "utf-8"), + operator.methodcaller("endswith", "a"), + operator.methodcaller("extract", r"(\w)(\d)"), + operator.methodcaller("extract", r"(\w)(\d)", expand=False), + operator.methodcaller("find", "a"), + operator.methodcaller("findall", "a"), + operator.methodcaller("get", 0), + operator.methodcaller("index", "a"), + operator.methodcaller("len"), + operator.methodcaller("ljust", 4), + operator.methodcaller("lower"), + operator.methodcaller("lstrip"), + operator.methodcaller("match", r"\w"), + operator.methodcaller("normalize", "NFC"), + operator.methodcaller("pad", 4), + operator.methodcaller("partition", "a"), + operator.methodcaller("repeat", 2), + operator.methodcaller("replace", "a", "b"), + operator.methodcaller("rfind", "a"), + operator.methodcaller("rindex", "a"), + operator.methodcaller("rjust", 4), + operator.methodcaller("rpartition", "a"), + operator.methodcaller("rstrip"), + operator.methodcaller("slice", 4), + operator.methodcaller("slice_replace", 1, repl="a"), + operator.methodcaller("startswith", "a"), + operator.methodcaller("strip"), + operator.methodcaller("swapcase"), + operator.methodcaller("translate", {"a": "b"}), + operator.methodcaller("upper"), + operator.methodcaller("wrap", 4), + operator.methodcaller("zfill", 4), + operator.methodcaller("isalnum"), + operator.methodcaller("isalpha"), + operator.methodcaller("isdigit"), + operator.methodcaller("isspace"), + operator.methodcaller("islower"), + operator.methodcaller("isupper"), + operator.methodcaller("istitle"), + operator.methodcaller("isnumeric"), + operator.methodcaller("isdecimal"), + operator.methodcaller("get_dummies"), + ], + ids=idfn, +) +def test_string_method(method): + s = pd.Series(["a1"]) + s.attrs = {"a": 1} + result = method(s.str) + assert result.attrs == {"a": 1} + + +@pytest.mark.parametrize( + "method", + [ + operator.methodcaller("to_period"), + operator.methodcaller("tz_localize", "CET"), + operator.methodcaller("normalize"), + operator.methodcaller("strftime", "%Y"), + operator.methodcaller("round", "h"), + operator.methodcaller("floor", "h"), + operator.methodcaller("ceil", "h"), + operator.methodcaller("month_name"), + operator.methodcaller("day_name"), + ], + ids=idfn, +) +def test_datetime_method(method): + s = pd.Series(pd.date_range("2000", periods=4)) + s.attrs = {"a": 1} + result = method(s.dt) + assert result.attrs == {"a": 1} + + +@pytest.mark.parametrize( + "attr", + [ + "date", + "time", + "timetz", + "year", + "month", + "day", + "hour", + "minute", + "second", + "microsecond", + "nanosecond", + "dayofweek", + "day_of_week", + "dayofyear", + "day_of_year", + "quarter", + "is_month_start", + "is_month_end", + "is_quarter_start", + "is_quarter_end", + "is_year_start", + "is_year_end", + "is_leap_year", + "daysinmonth", + "days_in_month", + ], +) +def test_datetime_property(attr): + s = pd.Series(pd.date_range("2000", periods=4)) + s.attrs = {"a": 1} + result = getattr(s.dt, attr) + assert result.attrs == {"a": 1} + + +@pytest.mark.parametrize( + "attr", ["days", "seconds", "microseconds", "nanoseconds", "components"] +) +def test_timedelta_property(attr): + s = pd.Series(pd.timedelta_range("2000", periods=4)) + s.attrs = {"a": 1} + result = getattr(s.dt, attr) + assert result.attrs == {"a": 1} + + +@pytest.mark.parametrize("method", [operator.methodcaller("total_seconds")]) +def test_timedelta_methods(method): + s = pd.Series(pd.timedelta_range("2000", periods=4)) + s.attrs = {"a": 1} + result = method(s.dt) + assert result.attrs == {"a": 1} + + +@pytest.mark.parametrize( + "method", + [ + operator.methodcaller("add_categories", ["c"]), + operator.methodcaller("as_ordered"), + operator.methodcaller("as_unordered"), + lambda x: getattr(x, "codes"), + operator.methodcaller("remove_categories", "a"), + operator.methodcaller("remove_unused_categories"), + operator.methodcaller("rename_categories", {"a": "A", "b": "B"}), + operator.methodcaller("reorder_categories", ["b", "a"]), + operator.methodcaller("set_categories", ["A", "B"]), + ], +) +@not_implemented_mark +def test_categorical_accessor(method): + s = pd.Series(["a", "b"], dtype="category") + s.attrs = {"a": 1} + result = method(s.cat) + assert result.attrs == {"a": 1} + + +# ---------------------------------------------------------------------------- +# Groupby + + +@pytest.mark.parametrize( + "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})] +) +@pytest.mark.parametrize( + "method", + [ + operator.methodcaller("sum"), + lambda x: x.apply(lambda y: y), + lambda x: x.agg("sum"), + lambda x: x.agg("mean"), + lambda x: x.agg("median"), + ], +) +def test_groupby_finalize(obj, method): + obj.attrs = {"a": 1} + result = method(obj.groupby([0, 0], group_keys=False)) + assert result.attrs == {"a": 1} + + +@pytest.mark.parametrize( + "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})] +) +@pytest.mark.parametrize( + "method", + [ + lambda x: x.agg(["sum", "count"]), + lambda x: x.agg("std"), + lambda x: x.agg("var"), + lambda x: x.agg("sem"), + lambda x: x.agg("size"), + lambda x: x.agg("ohlc"), + ], +) +@not_implemented_mark +def test_groupby_finalize_not_implemented(obj, method): + obj.attrs = {"a": 1} + result = method(obj.groupby([0, 0])) + assert result.attrs == {"a": 1} + + +def test_finalize_frame_series_name(): + # https://github.com/pandas-dev/pandas/pull/37186/files#r506978889 + # ensure we don't copy the column `name` to the Series. + df = pd.DataFrame({"name": [1, 2]}) + result = pd.Series([1, 2]).__finalize__(df) + assert result.name is None diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/test_frame.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/test_frame.py new file mode 100644 index 0000000000000000000000000000000000000000..fc7aa9e7b2c46362aa9b6a9ebfc4f663cfd61058 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/test_frame.py @@ -0,0 +1,209 @@ +from copy import deepcopy +from operator import methodcaller + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + MultiIndex, + Series, + date_range, +) +import pandas._testing as tm + + +class TestDataFrame: + @pytest.mark.parametrize("func", ["_set_axis_name", "rename_axis"]) + def test_set_axis_name(self, func): + df = DataFrame([[1, 2], [3, 4]]) + + result = methodcaller(func, "foo")(df) + assert df.index.name is None + assert result.index.name == "foo" + + result = methodcaller(func, "cols", axis=1)(df) + assert df.columns.name is None + assert result.columns.name == "cols" + + @pytest.mark.parametrize("func", ["_set_axis_name", "rename_axis"]) + def test_set_axis_name_mi(self, func): + df = DataFrame( + np.empty((3, 3)), + index=MultiIndex.from_tuples([("A", x) for x in list("aBc")]), + columns=MultiIndex.from_tuples([("C", x) for x in list("xyz")]), + ) + + level_names = ["L1", "L2"] + + result = methodcaller(func, level_names)(df) + assert result.index.names == level_names + assert result.columns.names == [None, None] + + result = methodcaller(func, level_names, axis=1)(df) + assert result.columns.names == ["L1", "L2"] + assert result.index.names == [None, None] + + def test_nonzero_single_element(self): + # allow single item via bool method + msg_warn = ( + "DataFrame.bool is now deprecated and will be removed " + "in future version of pandas" + ) + df = DataFrame([[True]]) + df1 = DataFrame([[False]]) + with tm.assert_produces_warning(FutureWarning, match=msg_warn): + assert df.bool() + + with tm.assert_produces_warning(FutureWarning, match=msg_warn): + assert not df1.bool() + + df = DataFrame([[False, False]]) + msg_err = "The truth value of a DataFrame is ambiguous" + with pytest.raises(ValueError, match=msg_err): + bool(df) + + with tm.assert_produces_warning(FutureWarning, match=msg_warn): + with pytest.raises(ValueError, match=msg_err): + df.bool() + + def test_metadata_propagation_indiv_groupby(self): + # groupby + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.default_rng(2).standard_normal(8), + "D": np.random.default_rng(2).standard_normal(8), + } + ) + result = df.groupby("A").sum() + tm.assert_metadata_equivalent(df, result) + + def test_metadata_propagation_indiv_resample(self): + # resample + df = DataFrame( + np.random.default_rng(2).standard_normal((1000, 2)), + index=date_range("20130101", periods=1000, freq="s"), + ) + result = df.resample("1min") + tm.assert_metadata_equivalent(df, result) + + def test_metadata_propagation_indiv(self, monkeypatch): + # merging with override + # GH 6923 + + def finalize(self, other, method=None, **kwargs): + for name in self._metadata: + if method == "merge": + left, right = other.left, other.right + value = getattr(left, name, "") + "|" + getattr(right, name, "") + object.__setattr__(self, name, value) + elif method == "concat": + value = "+".join( + [getattr(o, name) for o in other.objs if getattr(o, name, None)] + ) + object.__setattr__(self, name, value) + else: + object.__setattr__(self, name, getattr(other, name, "")) + + return self + + with monkeypatch.context() as m: + m.setattr(DataFrame, "_metadata", ["filename"]) + m.setattr(DataFrame, "__finalize__", finalize) + + df1 = DataFrame( + np.random.default_rng(2).integers(0, 4, (3, 2)), columns=["a", "b"] + ) + df2 = DataFrame( + np.random.default_rng(2).integers(0, 4, (3, 2)), columns=["c", "d"] + ) + DataFrame._metadata = ["filename"] + df1.filename = "fname1.csv" + df2.filename = "fname2.csv" + + result = df1.merge(df2, left_on=["a"], right_on=["c"], how="inner") + assert result.filename == "fname1.csv|fname2.csv" + + # concat + # GH#6927 + df1 = DataFrame( + np.random.default_rng(2).integers(0, 4, (3, 2)), columns=list("ab") + ) + df1.filename = "foo" + + result = pd.concat([df1, df1]) + assert result.filename == "foo+foo" + + def test_set_attribute(self): + # Test for consistent setattr behavior when an attribute and a column + # have the same name (Issue #8994) + df = DataFrame({"x": [1, 2, 3]}) + + df.y = 2 + df["y"] = [2, 4, 6] + df.y = 5 + + assert df.y == 5 + tm.assert_series_equal(df["y"], Series([2, 4, 6], name="y")) + + def test_deepcopy_empty(self): + # This test covers empty frame copying with non-empty column sets + # as reported in issue GH15370 + empty_frame = DataFrame(data=[], index=[], columns=["A"]) + empty_frame_copy = deepcopy(empty_frame) + + tm.assert_frame_equal(empty_frame_copy, empty_frame) + + +# formerly in Generic but only test DataFrame +class TestDataFrame2: + @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0]) + def test_validate_bool_args(self, value): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + + msg = 'For argument "inplace" expected type bool, received type' + with pytest.raises(ValueError, match=msg): + df.copy().rename_axis(mapper={"a": "x", "b": "y"}, axis=1, inplace=value) + + with pytest.raises(ValueError, match=msg): + df.copy().drop("a", axis=1, inplace=value) + + with pytest.raises(ValueError, match=msg): + df.copy().fillna(value=0, inplace=value) + + with pytest.raises(ValueError, match=msg): + df.copy().replace(to_replace=1, value=7, inplace=value) + + with pytest.raises(ValueError, match=msg): + df.copy().interpolate(inplace=value) + + with pytest.raises(ValueError, match=msg): + df.copy()._where(cond=df.a > 2, inplace=value) + + with pytest.raises(ValueError, match=msg): + df.copy().mask(cond=df.a > 2, inplace=value) + + def test_unexpected_keyword(self): + # GH8597 + df = DataFrame( + np.random.default_rng(2).standard_normal((5, 2)), columns=["jim", "joe"] + ) + ca = pd.Categorical([0, 0, 2, 2, 3, np.nan]) + ts = df["joe"].copy() + ts[2] = np.nan + + msg = "unexpected keyword" + with pytest.raises(TypeError, match=msg): + df.drop("joe", axis=1, in_place=True) + + with pytest.raises(TypeError, match=msg): + df.reindex([1, 0], inplace=True) + + with pytest.raises(TypeError, match=msg): + ca.fillna(0, inplace=True) + + with pytest.raises(TypeError, match=msg): + ts.fillna(0, in_place=True) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/test_generic.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/test_generic.py new file mode 100644 index 0000000000000000000000000000000000000000..6564e381af0ea9b821e44f780ce209936f9524dc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/test_generic.py @@ -0,0 +1,504 @@ +from copy import ( + copy, + deepcopy, +) + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_scalar + +from pandas import ( + DataFrame, + Index, + Series, + date_range, +) +import pandas._testing as tm + +# ---------------------------------------------------------------------- +# Generic types test cases + + +def construct(box, shape, value=None, dtype=None, **kwargs): + """ + construct an object for the given shape + if value is specified use that if its a scalar + if value is an array, repeat it as needed + """ + if isinstance(shape, int): + shape = tuple([shape] * box._AXIS_LEN) + if value is not None: + if is_scalar(value): + if value == "empty": + arr = None + dtype = np.float64 + + # remove the info axis + kwargs.pop(box._info_axis_name, None) + else: + arr = np.empty(shape, dtype=dtype) + arr.fill(value) + else: + fshape = np.prod(shape) + arr = value.ravel() + new_shape = fshape / arr.shape[0] + if fshape % arr.shape[0] != 0: + raise Exception("invalid value passed in construct") + + arr = np.repeat(arr, new_shape).reshape(shape) + else: + arr = np.random.default_rng(2).standard_normal(shape) + return box(arr, dtype=dtype, **kwargs) + + +class TestGeneric: + @pytest.mark.parametrize( + "func", + [ + str.lower, + {x: x.lower() for x in list("ABCD")}, + Series({x: x.lower() for x in list("ABCD")}), + ], + ) + def test_rename(self, frame_or_series, func): + # single axis + idx = list("ABCD") + + for axis in frame_or_series._AXIS_ORDERS: + kwargs = {axis: idx} + obj = construct(frame_or_series, 4, **kwargs) + + # rename a single axis + result = obj.rename(**{axis: func}) + expected = obj.copy() + setattr(expected, axis, list("abcd")) + tm.assert_equal(result, expected) + + def test_get_numeric_data(self, frame_or_series): + n = 4 + kwargs = { + frame_or_series._get_axis_name(i): list(range(n)) + for i in range(frame_or_series._AXIS_LEN) + } + + # get the numeric data + o = construct(frame_or_series, n, **kwargs) + result = o._get_numeric_data() + tm.assert_equal(result, o) + + # non-inclusion + result = o._get_bool_data() + expected = construct(frame_or_series, n, value="empty", **kwargs) + if isinstance(o, DataFrame): + # preserve columns dtype + expected.columns = o.columns[:0] + # https://github.com/pandas-dev/pandas/issues/50862 + tm.assert_equal(result.reset_index(drop=True), expected) + + # get the bool data + arr = np.array([True, True, False, True]) + o = construct(frame_or_series, n, value=arr, **kwargs) + result = o._get_numeric_data() + tm.assert_equal(result, o) + + def test_nonzero(self, frame_or_series): + # GH 4633 + # look at the boolean/nonzero behavior for objects + obj = construct(frame_or_series, shape=4) + msg = f"The truth value of a {frame_or_series.__name__} is ambiguous" + with pytest.raises(ValueError, match=msg): + bool(obj == 0) + with pytest.raises(ValueError, match=msg): + bool(obj == 1) + with pytest.raises(ValueError, match=msg): + bool(obj) + + obj = construct(frame_or_series, shape=4, value=1) + with pytest.raises(ValueError, match=msg): + bool(obj == 0) + with pytest.raises(ValueError, match=msg): + bool(obj == 1) + with pytest.raises(ValueError, match=msg): + bool(obj) + + obj = construct(frame_or_series, shape=4, value=np.nan) + with pytest.raises(ValueError, match=msg): + bool(obj == 0) + with pytest.raises(ValueError, match=msg): + bool(obj == 1) + with pytest.raises(ValueError, match=msg): + bool(obj) + + # empty + obj = construct(frame_or_series, shape=0) + with pytest.raises(ValueError, match=msg): + bool(obj) + + # invalid behaviors + + obj1 = construct(frame_or_series, shape=4, value=1) + obj2 = construct(frame_or_series, shape=4, value=1) + + with pytest.raises(ValueError, match=msg): + if obj1: + pass + + with pytest.raises(ValueError, match=msg): + obj1 and obj2 + with pytest.raises(ValueError, match=msg): + obj1 or obj2 + with pytest.raises(ValueError, match=msg): + not obj1 + + def test_frame_or_series_compound_dtypes(self, frame_or_series): + # see gh-5191 + # Compound dtypes should raise NotImplementedError. + + def f(dtype): + return construct(frame_or_series, shape=3, value=1, dtype=dtype) + + msg = ( + "compound dtypes are not implemented " + f"in the {frame_or_series.__name__} constructor" + ) + + with pytest.raises(NotImplementedError, match=msg): + f([("A", "datetime64[h]"), ("B", "str"), ("C", "int32")]) + + # these work (though results may be unexpected) + f("int64") + f("float64") + f("M8[ns]") + + def test_metadata_propagation(self, frame_or_series): + # check that the metadata matches up on the resulting ops + + o = construct(frame_or_series, shape=3) + o.name = "foo" + o2 = construct(frame_or_series, shape=3) + o2.name = "bar" + + # ---------- + # preserving + # ---------- + + # simple ops with scalars + for op in ["__add__", "__sub__", "__truediv__", "__mul__"]: + result = getattr(o, op)(1) + tm.assert_metadata_equivalent(o, result) + + # ops with like + for op in ["__add__", "__sub__", "__truediv__", "__mul__"]: + result = getattr(o, op)(o) + tm.assert_metadata_equivalent(o, result) + + # simple boolean + for op in ["__eq__", "__le__", "__ge__"]: + v1 = getattr(o, op)(o) + tm.assert_metadata_equivalent(o, v1) + tm.assert_metadata_equivalent(o, v1 & v1) + tm.assert_metadata_equivalent(o, v1 | v1) + + # combine_first + result = o.combine_first(o2) + tm.assert_metadata_equivalent(o, result) + + # --------------------------- + # non-preserving (by default) + # --------------------------- + + # add non-like + result = o + o2 + tm.assert_metadata_equivalent(result) + + # simple boolean + for op in ["__eq__", "__le__", "__ge__"]: + # this is a name matching op + v1 = getattr(o, op)(o) + v2 = getattr(o, op)(o2) + tm.assert_metadata_equivalent(v2) + tm.assert_metadata_equivalent(v1 & v2) + tm.assert_metadata_equivalent(v1 | v2) + + def test_size_compat(self, frame_or_series): + # GH8846 + # size property should be defined + + o = construct(frame_or_series, shape=10) + assert o.size == np.prod(o.shape) + assert o.size == 10 ** len(o.axes) + + def test_split_compat(self, frame_or_series): + # xref GH8846 + o = construct(frame_or_series, shape=10) + with tm.assert_produces_warning( + FutureWarning, match=".swapaxes' is deprecated", check_stacklevel=False + ): + assert len(np.array_split(o, 5)) == 5 + assert len(np.array_split(o, 2)) == 2 + + # See gh-12301 + def test_stat_unexpected_keyword(self, frame_or_series): + obj = construct(frame_or_series, 5) + starwars = "Star Wars" + errmsg = "unexpected keyword" + + with pytest.raises(TypeError, match=errmsg): + obj.max(epic=starwars) # stat_function + with pytest.raises(TypeError, match=errmsg): + obj.var(epic=starwars) # stat_function_ddof + with pytest.raises(TypeError, match=errmsg): + obj.sum(epic=starwars) # cum_function + with pytest.raises(TypeError, match=errmsg): + obj.any(epic=starwars) # logical_function + + @pytest.mark.parametrize("func", ["sum", "cumsum", "any", "var"]) + def test_api_compat(self, func, frame_or_series): + # GH 12021 + # compat for __name__, __qualname__ + + obj = construct(frame_or_series, 5) + f = getattr(obj, func) + assert f.__name__ == func + assert f.__qualname__.endswith(func) + + def test_stat_non_defaults_args(self, frame_or_series): + obj = construct(frame_or_series, 5) + out = np.array([0]) + errmsg = "the 'out' parameter is not supported" + + with pytest.raises(ValueError, match=errmsg): + obj.max(out=out) # stat_function + with pytest.raises(ValueError, match=errmsg): + obj.var(out=out) # stat_function_ddof + with pytest.raises(ValueError, match=errmsg): + obj.sum(out=out) # cum_function + with pytest.raises(ValueError, match=errmsg): + obj.any(out=out) # logical_function + + def test_truncate_out_of_bounds(self, frame_or_series): + # GH11382 + + # small + shape = [2000] + ([1] * (frame_or_series._AXIS_LEN - 1)) + small = construct(frame_or_series, shape, dtype="int8", value=1) + tm.assert_equal(small.truncate(), small) + tm.assert_equal(small.truncate(before=0, after=3e3), small) + tm.assert_equal(small.truncate(before=-1, after=2e3), small) + + # big + shape = [2_000_000] + ([1] * (frame_or_series._AXIS_LEN - 1)) + big = construct(frame_or_series, shape, dtype="int8", value=1) + tm.assert_equal(big.truncate(), big) + tm.assert_equal(big.truncate(before=0, after=3e6), big) + tm.assert_equal(big.truncate(before=-1, after=2e6), big) + + @pytest.mark.parametrize( + "func", + [copy, deepcopy, lambda x: x.copy(deep=False), lambda x: x.copy(deep=True)], + ) + @pytest.mark.parametrize("shape", [0, 1, 2]) + def test_copy_and_deepcopy(self, frame_or_series, shape, func): + # GH 15444 + obj = construct(frame_or_series, shape) + obj_copy = func(obj) + assert obj_copy is not obj + tm.assert_equal(obj_copy, obj) + + def test_data_deprecated(self, frame_or_series): + obj = frame_or_series() + msg = "(Series|DataFrame)._data is deprecated" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + mgr = obj._data + assert mgr is obj._mgr + + +class TestNDFrame: + # tests that don't fit elsewhere + + @pytest.mark.parametrize( + "ser", + [ + Series(range(10), dtype=np.float64), + Series([str(i) for i in range(10)], dtype=object), + ], + ) + def test_squeeze_series_noop(self, ser): + # noop + tm.assert_series_equal(ser.squeeze(), ser) + + def test_squeeze_frame_noop(self): + # noop + df = DataFrame(np.eye(2)) + tm.assert_frame_equal(df.squeeze(), df) + + def test_squeeze_frame_reindex(self): + # squeezing + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ).reindex(columns=["A"]) + tm.assert_series_equal(df.squeeze(), df["A"]) + + def test_squeeze_0_len_dim(self): + # don't fail with 0 length dimensions GH11229 & GH8999 + empty_series = Series([], name="five", dtype=np.float64) + empty_frame = DataFrame([empty_series]) + tm.assert_series_equal(empty_series, empty_series.squeeze()) + tm.assert_series_equal(empty_series, empty_frame.squeeze()) + + def test_squeeze_axis(self): + # axis argument + df = DataFrame( + np.random.default_rng(2).standard_normal((1, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=1, freq="B"), + ).iloc[:, :1] + assert df.shape == (1, 1) + tm.assert_series_equal(df.squeeze(axis=0), df.iloc[0]) + tm.assert_series_equal(df.squeeze(axis="index"), df.iloc[0]) + tm.assert_series_equal(df.squeeze(axis=1), df.iloc[:, 0]) + tm.assert_series_equal(df.squeeze(axis="columns"), df.iloc[:, 0]) + assert df.squeeze() == df.iloc[0, 0] + msg = "No axis named 2 for object type DataFrame" + with pytest.raises(ValueError, match=msg): + df.squeeze(axis=2) + msg = "No axis named x for object type DataFrame" + with pytest.raises(ValueError, match=msg): + df.squeeze(axis="x") + + def test_squeeze_axis_len_3(self): + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=3, freq="B"), + ) + tm.assert_frame_equal(df.squeeze(axis=0), df) + + def test_numpy_squeeze(self): + s = Series(range(2), dtype=np.float64) + tm.assert_series_equal(np.squeeze(s), s) + + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ).reindex(columns=["A"]) + tm.assert_series_equal(np.squeeze(df), df["A"]) + + @pytest.mark.parametrize( + "ser", + [ + Series(range(10), dtype=np.float64), + Series([str(i) for i in range(10)], dtype=object), + ], + ) + def test_transpose_series(self, ser): + # calls implementation in pandas/core/base.py + tm.assert_series_equal(ser.transpose(), ser) + + def test_transpose_frame(self): + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) + tm.assert_frame_equal(df.transpose().transpose(), df) + + def test_numpy_transpose(self, frame_or_series): + obj = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) + obj = tm.get_obj(obj, frame_or_series) + + if frame_or_series is Series: + # 1D -> np.transpose is no-op + tm.assert_series_equal(np.transpose(obj), obj) + + # round-trip preserved + tm.assert_equal(np.transpose(np.transpose(obj)), obj) + + msg = "the 'axes' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.transpose(obj, axes=1) + + @pytest.mark.parametrize( + "ser", + [ + Series(range(10), dtype=np.float64), + Series([str(i) for i in range(10)], dtype=object), + ], + ) + def test_take_series(self, ser): + indices = [1, 5, -2, 6, 3, -1] + out = ser.take(indices) + expected = Series( + data=ser.values.take(indices), + index=ser.index.take(indices), + dtype=ser.dtype, + ) + tm.assert_series_equal(out, expected) + + def test_take_frame(self): + indices = [1, 5, -2, 6, 3, -1] + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) + out = df.take(indices) + expected = DataFrame( + data=df.values.take(indices, axis=0), + index=df.index.take(indices), + columns=df.columns, + ) + tm.assert_frame_equal(out, expected) + + def test_take_invalid_kwargs(self, frame_or_series): + indices = [-3, 2, 0, 1] + + obj = DataFrame(range(5)) + obj = tm.get_obj(obj, frame_or_series) + + msg = r"take\(\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=msg): + obj.take(indices, foo=2) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + obj.take(indices, out=indices) + + msg = "the 'mode' parameter is not supported" + with pytest.raises(ValueError, match=msg): + obj.take(indices, mode="clip") + + def test_axis_classmethods(self, frame_or_series): + box = frame_or_series + obj = box(dtype=object) + values = box._AXIS_TO_AXIS_NUMBER.keys() + for v in values: + assert obj._get_axis_number(v) == box._get_axis_number(v) + assert obj._get_axis_name(v) == box._get_axis_name(v) + assert obj._get_block_manager_axis(v) == box._get_block_manager_axis(v) + + def test_flags_identity(self, frame_or_series): + obj = Series([1, 2]) + if frame_or_series is DataFrame: + obj = obj.to_frame() + + assert obj.flags is obj.flags + obj2 = obj.copy() + assert obj2.flags is not obj.flags + + def test_bool_dep(self) -> None: + # GH-51749 + msg_warn = ( + "DataFrame.bool is now deprecated and will be removed " + "in future version of pandas" + ) + with tm.assert_produces_warning(FutureWarning, match=msg_warn): + DataFrame({"col": [False]}).bool() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/test_label_or_level_utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/test_label_or_level_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..97be46f716d7daa98c1c1ebab04e1e6abb3a55bc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/test_label_or_level_utils.py @@ -0,0 +1,336 @@ +import pytest + +from pandas.core.dtypes.missing import array_equivalent + +import pandas as pd + + +# Fixtures +# ======== +@pytest.fixture +def df(): + """DataFrame with columns 'L1', 'L2', and 'L3'""" + return pd.DataFrame({"L1": [1, 2, 3], "L2": [11, 12, 13], "L3": ["A", "B", "C"]}) + + +@pytest.fixture(params=[[], ["L1"], ["L1", "L2"], ["L1", "L2", "L3"]]) +def df_levels(request, df): + """DataFrame with columns or index levels 'L1', 'L2', and 'L3'""" + levels = request.param + + if levels: + df = df.set_index(levels) + + return df + + +@pytest.fixture +def df_ambig(df): + """DataFrame with levels 'L1' and 'L2' and labels 'L1' and 'L3'""" + df = df.set_index(["L1", "L2"]) + + df["L1"] = df["L3"] + + return df + + +@pytest.fixture +def df_duplabels(df): + """DataFrame with level 'L1' and labels 'L2', 'L3', and 'L2'""" + df = df.set_index(["L1"]) + df = pd.concat([df, df["L2"]], axis=1) + + return df + + +# Test is label/level reference +# ============================= +def get_labels_levels(df_levels): + expected_labels = list(df_levels.columns) + expected_levels = [name for name in df_levels.index.names if name is not None] + return expected_labels, expected_levels + + +def assert_label_reference(frame, labels, axis): + for label in labels: + assert frame._is_label_reference(label, axis=axis) + assert not frame._is_level_reference(label, axis=axis) + assert frame._is_label_or_level_reference(label, axis=axis) + + +def assert_level_reference(frame, levels, axis): + for level in levels: + assert frame._is_level_reference(level, axis=axis) + assert not frame._is_label_reference(level, axis=axis) + assert frame._is_label_or_level_reference(level, axis=axis) + + +# DataFrame +# --------- +def test_is_level_or_label_reference_df_simple(df_levels, axis): + axis = df_levels._get_axis_number(axis) + # Compute expected labels and levels + expected_labels, expected_levels = get_labels_levels(df_levels) + + # Transpose frame if axis == 1 + if axis == 1: + df_levels = df_levels.T + + # Perform checks + assert_level_reference(df_levels, expected_levels, axis=axis) + assert_label_reference(df_levels, expected_labels, axis=axis) + + +def test_is_level_reference_df_ambig(df_ambig, axis): + axis = df_ambig._get_axis_number(axis) + + # Transpose frame if axis == 1 + if axis == 1: + df_ambig = df_ambig.T + + # df has both an on-axis level and off-axis label named L1 + # Therefore L1 should reference the label, not the level + assert_label_reference(df_ambig, ["L1"], axis=axis) + + # df has an on-axis level named L2 and it is not ambiguous + # Therefore L2 is an level reference + assert_level_reference(df_ambig, ["L2"], axis=axis) + + # df has a column named L3 and it not an level reference + assert_label_reference(df_ambig, ["L3"], axis=axis) + + +# Series +# ------ +def test_is_level_reference_series_simple_axis0(df): + # Make series with L1 as index + s = df.set_index("L1").L2 + assert_level_reference(s, ["L1"], axis=0) + assert not s._is_level_reference("L2") + + # Make series with L1 and L2 as index + s = df.set_index(["L1", "L2"]).L3 + assert_level_reference(s, ["L1", "L2"], axis=0) + assert not s._is_level_reference("L3") + + +def test_is_level_reference_series_axis1_error(df): + # Make series with L1 as index + s = df.set_index("L1").L2 + + with pytest.raises(ValueError, match="No axis named 1"): + s._is_level_reference("L1", axis=1) + + +# Test _check_label_or_level_ambiguity_df +# ======================================= + + +# DataFrame +# --------- +def test_check_label_or_level_ambiguity_df(df_ambig, axis): + axis = df_ambig._get_axis_number(axis) + # Transpose frame if axis == 1 + if axis == 1: + df_ambig = df_ambig.T + msg = "'L1' is both a column level and an index label" + + else: + msg = "'L1' is both an index level and a column label" + # df_ambig has both an on-axis level and off-axis label named L1 + # Therefore, L1 is ambiguous. + with pytest.raises(ValueError, match=msg): + df_ambig._check_label_or_level_ambiguity("L1", axis=axis) + + # df_ambig has an on-axis level named L2,, and it is not ambiguous. + df_ambig._check_label_or_level_ambiguity("L2", axis=axis) + + # df_ambig has an off-axis label named L3, and it is not ambiguous + assert not df_ambig._check_label_or_level_ambiguity("L3", axis=axis) + + +# Series +# ------ +def test_check_label_or_level_ambiguity_series(df): + # A series has no columns and therefore references are never ambiguous + + # Make series with L1 as index + s = df.set_index("L1").L2 + s._check_label_or_level_ambiguity("L1", axis=0) + s._check_label_or_level_ambiguity("L2", axis=0) + + # Make series with L1 and L2 as index + s = df.set_index(["L1", "L2"]).L3 + s._check_label_or_level_ambiguity("L1", axis=0) + s._check_label_or_level_ambiguity("L2", axis=0) + s._check_label_or_level_ambiguity("L3", axis=0) + + +def test_check_label_or_level_ambiguity_series_axis1_error(df): + # Make series with L1 as index + s = df.set_index("L1").L2 + + with pytest.raises(ValueError, match="No axis named 1"): + s._check_label_or_level_ambiguity("L1", axis=1) + + +# Test _get_label_or_level_values +# =============================== +def assert_label_values(frame, labels, axis): + axis = frame._get_axis_number(axis) + for label in labels: + if axis == 0: + expected = frame[label]._values + else: + expected = frame.loc[label]._values + + result = frame._get_label_or_level_values(label, axis=axis) + assert array_equivalent(expected, result) + + +def assert_level_values(frame, levels, axis): + axis = frame._get_axis_number(axis) + for level in levels: + if axis == 0: + expected = frame.index.get_level_values(level=level)._values + else: + expected = frame.columns.get_level_values(level=level)._values + + result = frame._get_label_or_level_values(level, axis=axis) + assert array_equivalent(expected, result) + + +# DataFrame +# --------- +def test_get_label_or_level_values_df_simple(df_levels, axis): + # Compute expected labels and levels + expected_labels, expected_levels = get_labels_levels(df_levels) + + axis = df_levels._get_axis_number(axis) + # Transpose frame if axis == 1 + if axis == 1: + df_levels = df_levels.T + + # Perform checks + assert_label_values(df_levels, expected_labels, axis=axis) + assert_level_values(df_levels, expected_levels, axis=axis) + + +def test_get_label_or_level_values_df_ambig(df_ambig, axis): + axis = df_ambig._get_axis_number(axis) + # Transpose frame if axis == 1 + if axis == 1: + df_ambig = df_ambig.T + + # df has an on-axis level named L2, and it is not ambiguous. + assert_level_values(df_ambig, ["L2"], axis=axis) + + # df has an off-axis label named L3, and it is not ambiguous. + assert_label_values(df_ambig, ["L3"], axis=axis) + + +def test_get_label_or_level_values_df_duplabels(df_duplabels, axis): + axis = df_duplabels._get_axis_number(axis) + # Transpose frame if axis == 1 + if axis == 1: + df_duplabels = df_duplabels.T + + # df has unambiguous level 'L1' + assert_level_values(df_duplabels, ["L1"], axis=axis) + + # df has unique label 'L3' + assert_label_values(df_duplabels, ["L3"], axis=axis) + + # df has duplicate labels 'L2' + if axis == 0: + expected_msg = "The column label 'L2' is not unique" + else: + expected_msg = "The index label 'L2' is not unique" + + with pytest.raises(ValueError, match=expected_msg): + assert_label_values(df_duplabels, ["L2"], axis=axis) + + +# Series +# ------ +def test_get_label_or_level_values_series_axis0(df): + # Make series with L1 as index + s = df.set_index("L1").L2 + assert_level_values(s, ["L1"], axis=0) + + # Make series with L1 and L2 as index + s = df.set_index(["L1", "L2"]).L3 + assert_level_values(s, ["L1", "L2"], axis=0) + + +def test_get_label_or_level_values_series_axis1_error(df): + # Make series with L1 as index + s = df.set_index("L1").L2 + + with pytest.raises(ValueError, match="No axis named 1"): + s._get_label_or_level_values("L1", axis=1) + + +# Test _drop_labels_or_levels +# =========================== +def assert_labels_dropped(frame, labels, axis): + axis = frame._get_axis_number(axis) + for label in labels: + df_dropped = frame._drop_labels_or_levels(label, axis=axis) + + if axis == 0: + assert label in frame.columns + assert label not in df_dropped.columns + else: + assert label in frame.index + assert label not in df_dropped.index + + +def assert_levels_dropped(frame, levels, axis): + axis = frame._get_axis_number(axis) + for level in levels: + df_dropped = frame._drop_labels_or_levels(level, axis=axis) + + if axis == 0: + assert level in frame.index.names + assert level not in df_dropped.index.names + else: + assert level in frame.columns.names + assert level not in df_dropped.columns.names + + +# DataFrame +# --------- +def test_drop_labels_or_levels_df(df_levels, axis): + # Compute expected labels and levels + expected_labels, expected_levels = get_labels_levels(df_levels) + + axis = df_levels._get_axis_number(axis) + # Transpose frame if axis == 1 + if axis == 1: + df_levels = df_levels.T + + # Perform checks + assert_labels_dropped(df_levels, expected_labels, axis=axis) + assert_levels_dropped(df_levels, expected_levels, axis=axis) + + with pytest.raises(ValueError, match="not valid labels or levels"): + df_levels._drop_labels_or_levels("L4", axis=axis) + + +# Series +# ------ +def test_drop_labels_or_levels_series(df): + # Make series with L1 as index + s = df.set_index("L1").L2 + assert_levels_dropped(s, ["L1"], axis=0) + + with pytest.raises(ValueError, match="not valid labels or levels"): + s._drop_labels_or_levels("L4", axis=0) + + # Make series with L1 and L2 as index + s = df.set_index(["L1", "L2"]).L3 + assert_levels_dropped(s, ["L1", "L2"], axis=0) + + with pytest.raises(ValueError, match="not valid labels or levels"): + s._drop_labels_or_levels("L4", axis=0) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/test_series.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/test_series.py new file mode 100644 index 0000000000000000000000000000000000000000..3648961eb3808a316b2a23d3d720fdd26fe7fd06 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/test_series.py @@ -0,0 +1,159 @@ +from operator import methodcaller + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + MultiIndex, + Series, + date_range, +) +import pandas._testing as tm + + +class TestSeries: + @pytest.mark.parametrize("func", ["rename_axis", "_set_axis_name"]) + def test_set_axis_name_mi(self, func): + ser = Series( + [11, 21, 31], + index=MultiIndex.from_tuples( + [("A", x) for x in ["a", "B", "c"]], names=["l1", "l2"] + ), + ) + + result = methodcaller(func, ["L1", "L2"])(ser) + assert ser.index.name is None + assert ser.index.names == ["l1", "l2"] + assert result.index.name is None + assert result.index.names, ["L1", "L2"] + + def test_set_axis_name_raises(self): + ser = Series([1]) + msg = "No axis named 1 for object type Series" + with pytest.raises(ValueError, match=msg): + ser._set_axis_name(name="a", axis=1) + + def test_get_bool_data_preserve_dtype(self): + ser = Series([True, False, True]) + result = ser._get_bool_data() + tm.assert_series_equal(result, ser) + + def test_nonzero_single_element(self): + # allow single item via bool method + msg_warn = ( + "Series.bool is now deprecated and will be removed " + "in future version of pandas" + ) + ser = Series([True]) + ser1 = Series([False]) + with tm.assert_produces_warning(FutureWarning, match=msg_warn): + assert ser.bool() + with tm.assert_produces_warning(FutureWarning, match=msg_warn): + assert not ser1.bool() + + @pytest.mark.parametrize("data", [np.nan, pd.NaT, True, False]) + def test_nonzero_single_element_raise_1(self, data): + # single item nan to raise + series = Series([data]) + + msg = "The truth value of a Series is ambiguous" + with pytest.raises(ValueError, match=msg): + bool(series) + + @pytest.mark.parametrize("data", [np.nan, pd.NaT]) + def test_nonzero_single_element_raise_2(self, data): + msg_warn = ( + "Series.bool is now deprecated and will be removed " + "in future version of pandas" + ) + msg_err = "bool cannot act on a non-boolean single element Series" + series = Series([data]) + with tm.assert_produces_warning(FutureWarning, match=msg_warn): + with pytest.raises(ValueError, match=msg_err): + series.bool() + + @pytest.mark.parametrize("data", [(True, True), (False, False)]) + def test_nonzero_multiple_element_raise(self, data): + # multiple bool are still an error + msg_warn = ( + "Series.bool is now deprecated and will be removed " + "in future version of pandas" + ) + msg_err = "The truth value of a Series is ambiguous" + series = Series([data]) + with pytest.raises(ValueError, match=msg_err): + bool(series) + with tm.assert_produces_warning(FutureWarning, match=msg_warn): + with pytest.raises(ValueError, match=msg_err): + series.bool() + + @pytest.mark.parametrize("data", [1, 0, "a", 0.0]) + def test_nonbool_single_element_raise(self, data): + # single non-bool are an error + msg_warn = ( + "Series.bool is now deprecated and will be removed " + "in future version of pandas" + ) + msg_err1 = "The truth value of a Series is ambiguous" + msg_err2 = "bool cannot act on a non-boolean single element Series" + series = Series([data]) + with pytest.raises(ValueError, match=msg_err1): + bool(series) + with tm.assert_produces_warning(FutureWarning, match=msg_warn): + with pytest.raises(ValueError, match=msg_err2): + series.bool() + + def test_metadata_propagation_indiv_resample(self): + # resample + ts = Series( + np.random.default_rng(2).random(1000), + index=date_range("20130101", periods=1000, freq="s"), + name="foo", + ) + result = ts.resample("1min").mean() + tm.assert_metadata_equivalent(ts, result) + + result = ts.resample("1min").min() + tm.assert_metadata_equivalent(ts, result) + + result = ts.resample("1min").apply(lambda x: x.sum()) + tm.assert_metadata_equivalent(ts, result) + + def test_metadata_propagation_indiv(self, monkeypatch): + # check that the metadata matches up on the resulting ops + + ser = Series(range(3), range(3)) + ser.name = "foo" + ser2 = Series(range(3), range(3)) + ser2.name = "bar" + + result = ser.T + tm.assert_metadata_equivalent(ser, result) + + def finalize(self, other, method=None, **kwargs): + for name in self._metadata: + if method == "concat" and name == "filename": + value = "+".join( + [ + getattr(obj, name) + for obj in other.objs + if getattr(obj, name, None) + ] + ) + object.__setattr__(self, name, value) + else: + object.__setattr__(self, name, getattr(other, name, None)) + + return self + + with monkeypatch.context() as m: + m.setattr(Series, "_metadata", ["name", "filename"]) + m.setattr(Series, "__finalize__", finalize) + + ser.filename = "foo" + ser2.filename = "bar" + + result = pd.concat([ser, ser2]) + assert result.filename == "foo+bar" + assert result.name is None diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/test_to_xarray.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/test_to_xarray.py new file mode 100644 index 0000000000000000000000000000000000000000..d8401a8b2ae3f3b885374375b64910e166dbe525 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/generic/test_to_xarray.py @@ -0,0 +1,130 @@ +import numpy as np +import pytest + +from pandas import ( + Categorical, + DataFrame, + MultiIndex, + Series, + date_range, +) +import pandas._testing as tm + +pytest.importorskip("xarray") + + +class TestDataFrameToXArray: + @pytest.fixture + def df(self): + return DataFrame( + { + "a": list("abcd"), + "b": list(range(1, 5)), + "c": np.arange(3, 7).astype("u1"), + "d": np.arange(4.0, 8.0, dtype="float64"), + "e": [True, False, True, False], + "f": Categorical(list("abcd")), + "g": date_range("20130101", periods=4), + "h": date_range("20130101", periods=4, tz="US/Eastern"), + } + ) + + def test_to_xarray_index_types(self, index_flat, df, using_infer_string): + index = index_flat + # MultiIndex is tested in test_to_xarray_with_multiindex + if len(index) == 0: + pytest.skip("Test doesn't make sense for empty index") + + from xarray import Dataset + + df.index = index[:4] + df.index.name = "foo" + df.columns.name = "bar" + result = df.to_xarray() + assert result.sizes["foo"] == 4 + assert len(result.coords) == 1 + assert len(result.data_vars) == 8 + tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) + assert isinstance(result, Dataset) + + # idempotency + # datetimes w/tz are preserved + # column names are lost + expected = df.copy() + expected["f"] = expected["f"].astype( + object if not using_infer_string else "string[pyarrow_numpy]" + ) + expected.columns.name = None + tm.assert_frame_equal(result.to_dataframe(), expected) + + def test_to_xarray_empty(self, df): + from xarray import Dataset + + df.index.name = "foo" + result = df[0:0].to_xarray() + assert result.sizes["foo"] == 0 + assert isinstance(result, Dataset) + + def test_to_xarray_with_multiindex(self, df, using_infer_string): + from xarray import Dataset + + # MultiIndex + df.index = MultiIndex.from_product([["a"], range(4)], names=["one", "two"]) + result = df.to_xarray() + assert result.sizes["one"] == 1 + assert result.sizes["two"] == 4 + assert len(result.coords) == 2 + assert len(result.data_vars) == 8 + tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"]) + assert isinstance(result, Dataset) + + result = result.to_dataframe() + expected = df.copy() + expected["f"] = expected["f"].astype( + object if not using_infer_string else "string[pyarrow_numpy]" + ) + expected.columns.name = None + tm.assert_frame_equal(result, expected) + + +class TestSeriesToXArray: + def test_to_xarray_index_types(self, index_flat): + index = index_flat + # MultiIndex is tested in test_to_xarray_with_multiindex + + from xarray import DataArray + + ser = Series(range(len(index)), index=index, dtype="int64") + ser.index.name = "foo" + result = ser.to_xarray() + repr(result) + assert len(result) == len(index) + assert len(result.coords) == 1 + tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) + assert isinstance(result, DataArray) + + # idempotency + tm.assert_series_equal(result.to_series(), ser) + + def test_to_xarray_empty(self): + from xarray import DataArray + + ser = Series([], dtype=object) + ser.index.name = "foo" + result = ser.to_xarray() + assert len(result) == 0 + assert len(result.coords) == 1 + tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) + assert isinstance(result, DataArray) + + def test_to_xarray_with_multiindex(self): + from xarray import DataArray + + mi = MultiIndex.from_product([["a", "b"], range(3)], names=["one", "two"]) + ser = Series(range(6), dtype="int64", index=mi) + result = ser.to_xarray() + assert len(result) == 2 + tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"]) + assert isinstance(result, DataArray) + res = result.to_series() + tm.assert_series_equal(res, ser) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..287a232a0c4133e20993a7d52eadfd201da50168 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_all_methods.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_all_methods.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e68050e8ccfa128cbb1a1970d636b71fe3ff3d9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_all_methods.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_api.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..21062a221b4f4fd135f323e60d7af95081949aa8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_api.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_apply.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_apply.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4fc2d244378422edc11a8612d917e0b870739da6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_apply.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_apply_mutate.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_apply_mutate.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3bf4f91d95db7d826ca140364f4b390c0f0e4cbe Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_apply_mutate.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_bin_groupby.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_bin_groupby.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a5fdcec7f89f0fde5fddec74cd217ff0c0ea8542 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_bin_groupby.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_counting.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_counting.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7f10e6f63faf898831071037a8ba670eddba235 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_counting.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_cumulative.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_cumulative.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc428aae3f98f252d6683d98e43f8571e4828dbe Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_cumulative.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_filters.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_filters.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6781cde62b529dc41caa87c4a79c1c05de2d22f9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_filters.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_groupby_dropna.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_groupby_dropna.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd8f73b28da7155bf0df70af6ec784276fb73188 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_groupby_dropna.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_groupby_subclass.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_groupby_subclass.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7746113a73bb3314f8c8dfcc06128db801c6f02 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_groupby_subclass.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_indexing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_indexing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..567a0736f6ab17c9a30ec9d2251fff33bcd74372 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_indexing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_libgroupby.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_libgroupby.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..868da32115ab39f0232e79cf12bfa2ef6d5fae2a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_libgroupby.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_missing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_missing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3e7939f66c6aed56016938ab3b5f5d105f05e707 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_missing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_numeric_only.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_numeric_only.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e7be88c549d1f41e6cb3230c5d290d0fd2d0df90 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_numeric_only.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_pipe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_pipe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c7cc49693b0ea0d1422d12a09ebf11dd8630485 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_pipe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_raises.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_raises.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6353608d35e6a6fa2472510912706d4fb70620f1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_raises.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_timegrouper.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_timegrouper.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8375a85b8bddc33c97e60cc0e6f258af14822266 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/__pycache__/test_timegrouper.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/aggregate/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/aggregate/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..85d6928962e57fe3b521e4167f8851b9a486899d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/aggregate/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/aggregate/__pycache__/test_aggregate.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/aggregate/__pycache__/test_aggregate.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e6fc72b3689c0ee407cef05d80e6de8edc21881 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/aggregate/__pycache__/test_aggregate.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/aggregate/__pycache__/test_cython.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/aggregate/__pycache__/test_cython.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ed3b865dfadebac44763952fb032a1d6ad44f9e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/aggregate/__pycache__/test_cython.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/aggregate/__pycache__/test_numba.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/aggregate/__pycache__/test_numba.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e75d7c3b1dc2b85cb4a684bf3924fd19dc9bc239 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/aggregate/__pycache__/test_numba.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/aggregate/__pycache__/test_other.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/aggregate/__pycache__/test_other.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca5b447a535d5b27820767e668f62771e9cce244 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/aggregate/__pycache__/test_other.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b150dde143939ac23963c188dd1483ebdaf0817 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_corrwith.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_corrwith.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf92a26f3b872b6e8fd41eb1f2a678903686ba49 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_corrwith.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_describe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_describe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb77e3678226365318c44e0eba7d8ce700777b7a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_describe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_groupby_shift_diff.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_groupby_shift_diff.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..221e7e72890e809c40c1a036c7ced26141adfe33 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_groupby_shift_diff.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_is_monotonic.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_is_monotonic.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..141c05d0164f33f556d7fb54ac62046fffb8b670 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_is_monotonic.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_nlargest_nsmallest.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_nlargest_nsmallest.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3fc7b08842f349db781d73985d0f7349e40a9245 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_nlargest_nsmallest.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_nth.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_nth.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81c6fe2087dde50cf413bb116ef819934dde9220 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_nth.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_quantile.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_quantile.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..972d44b95cddcfde0da52d621dce52c2230ecbd3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_quantile.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_rank.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_rank.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5dbf5b29e3a617b9b04ea24e615d87db73384287 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_rank.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_sample.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_sample.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df6588b9e00bd5a43d7260be4b53761b0fce71db Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_sample.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_size.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_size.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ac90b532009e09ee222a137ac3a13feef9ed215f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_size.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_skew.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_skew.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70092eb19ef96f85e3c5be42086cd997bd3625d3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_skew.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_value_counts.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_value_counts.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25e414fa2d7805f7a06b153b42e6483135b655c6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/methods/__pycache__/test_value_counts.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/transform/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/transform/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..13bb711a208578dcbf6a9e0e4034465c8ed8b3b7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/transform/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/transform/__pycache__/test_numba.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/transform/__pycache__/test_numba.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3886ab9b2fa1fca3429b0c201a543210b3f5e5b5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/transform/__pycache__/test_numba.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/transform/__pycache__/test_transform.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/transform/__pycache__/test_transform.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d82575f23c2024e43312b5a5fa17ad4c77b1cca8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/groupby/transform/__pycache__/test_transform.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/test_reshape.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/test_reshape.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0ef676fd842590a3513f7f24361851d0ddd9f64 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/test_reshape.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b7403b12bdaa137e20beb705cafffcbc3f08ded Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_append.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_append.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f940b84f9e4463ffe8d426fe850bb33b9f2cb2f2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_append.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_astype.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_astype.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65177b83db48f5abb72fc54c7d9484ed6dfbca54 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_astype.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_category.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_category.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cea3eb2e4b705c86f286d086d0cba29f8b84225f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_category.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_constructors.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_constructors.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..05337deb9b2c5b61706ca17a6cfe7c633aa1b491 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_constructors.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_equals.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_equals.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9df4858d9db70b2a0f63b15890011fb792523885 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_equals.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_fillna.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_fillna.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..09cdcc71e77a1585920e52ff659671dda3a546da Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_fillna.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_formats.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_formats.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..508d26e9fa513c1c82506e77f0c8d801547ab425 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_formats.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_indexing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_indexing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..61a5588aabed8aed29863d39459119c98997db85 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_indexing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_map.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_map.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed30c80cf9d0b5b7e674bdc58e1461f70ca6e511 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_map.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_reindex.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_reindex.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0630eb4763da4d6656988dc83e778c5a595d7501 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_reindex.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_setops.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_setops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e687c00168b438d4f0985acd1e74cfe85ae8b194 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/__pycache__/test_setops.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_append.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_append.py new file mode 100644 index 0000000000000000000000000000000000000000..b48c3219f5111a7a1226d09ce4625c723c4168fb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_append.py @@ -0,0 +1,62 @@ +import pytest + +from pandas import ( + CategoricalIndex, + Index, +) +import pandas._testing as tm + + +class TestAppend: + @pytest.fixture + def ci(self): + categories = list("cab") + return CategoricalIndex(list("aabbca"), categories=categories, ordered=False) + + def test_append(self, ci): + # append cats with the same categories + result = ci[:3].append(ci[3:]) + tm.assert_index_equal(result, ci, exact=True) + + foos = [ci[:1], ci[1:3], ci[3:]] + result = foos[0].append(foos[1:]) + tm.assert_index_equal(result, ci, exact=True) + + def test_append_empty(self, ci): + # empty + result = ci.append([]) + tm.assert_index_equal(result, ci, exact=True) + + def test_append_mismatched_categories(self, ci): + # appending with different categories or reordered is not ok + msg = "all inputs must be Index" + with pytest.raises(TypeError, match=msg): + ci.append(ci.values.set_categories(list("abcd"))) + with pytest.raises(TypeError, match=msg): + ci.append(ci.values.reorder_categories(list("abc"))) + + def test_append_category_objects(self, ci): + # with objects + result = ci.append(Index(["c", "a"])) + expected = CategoricalIndex(list("aabbcaca"), categories=ci.categories) + tm.assert_index_equal(result, expected, exact=True) + + def test_append_non_categories(self, ci): + # invalid objects -> cast to object via concat_compat + result = ci.append(Index(["a", "d"])) + expected = Index(["a", "a", "b", "b", "c", "a", "a", "d"]) + tm.assert_index_equal(result, expected, exact=True) + + def test_append_object(self, ci): + # GH#14298 - if base object is not categorical -> coerce to object + result = Index(["c", "a"]).append(ci) + expected = Index(list("caaabbca")) + tm.assert_index_equal(result, expected, exact=True) + + def test_append_to_another(self): + # hits Index._concat + fst = Index(["a", "b"]) + snd = CategoricalIndex(["d", "e"]) + result = fst.append(snd) + expected = Index(["a", "b", "d", "e"]) + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_astype.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_astype.py new file mode 100644 index 0000000000000000000000000000000000000000..a17627b7515b26b1fcfdca0feec376f03a018e83 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_astype.py @@ -0,0 +1,90 @@ +from datetime import date + +import numpy as np +import pytest + +from pandas import ( + Categorical, + CategoricalDtype, + CategoricalIndex, + Index, + IntervalIndex, +) +import pandas._testing as tm + + +class TestAstype: + def test_astype(self): + ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) + + result = ci.astype(object) + tm.assert_index_equal(result, Index(np.array(ci), dtype=object)) + + # this IS equal, but not the same class + assert result.equals(ci) + assert isinstance(result, Index) + assert not isinstance(result, CategoricalIndex) + + # interval + ii = IntervalIndex.from_arrays(left=[-0.001, 2.0], right=[2, 4], closed="right") + + ci = CategoricalIndex( + Categorical.from_codes([0, 1, -1], categories=ii, ordered=True) + ) + + result = ci.astype("interval") + expected = ii.take([0, 1, -1], allow_fill=True, fill_value=np.nan) + tm.assert_index_equal(result, expected) + + result = IntervalIndex(result.values) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("name", [None, "foo"]) + @pytest.mark.parametrize("dtype_ordered", [True, False]) + @pytest.mark.parametrize("index_ordered", [True, False]) + def test_astype_category(self, name, dtype_ordered, index_ordered): + # GH#18630 + index = CategoricalIndex( + list("aabbca"), categories=list("cab"), ordered=index_ordered + ) + if name: + index = index.rename(name) + + # standard categories + dtype = CategoricalDtype(ordered=dtype_ordered) + result = index.astype(dtype) + expected = CategoricalIndex( + index.tolist(), + name=name, + categories=index.categories, + ordered=dtype_ordered, + ) + tm.assert_index_equal(result, expected) + + # non-standard categories + dtype = CategoricalDtype(index.unique().tolist()[:-1], dtype_ordered) + result = index.astype(dtype) + expected = CategoricalIndex(index.tolist(), name=name, dtype=dtype) + tm.assert_index_equal(result, expected) + + if dtype_ordered is False: + # dtype='category' can't specify ordered, so only test once + result = index.astype("category") + expected = index + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("box", [True, False]) + def test_categorical_date_roundtrip(self, box): + # astype to categorical and back should preserve date objects + v = date.today() + + obj = Index([v, v]) + assert obj.dtype == object + if box: + obj = obj.array + + cat = obj.astype("category") + + rtrip = cat.astype(object) + assert rtrip.dtype == object + assert type(rtrip[0]) is date diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_category.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_category.py new file mode 100644 index 0000000000000000000000000000000000000000..03a298a13dc2b45b3e78ec2d6390741709e42590 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_category.py @@ -0,0 +1,394 @@ +import numpy as np +import pytest + +from pandas._config import using_pyarrow_string_dtype + +from pandas._libs import index as libindex +from pandas._libs.arrays import NDArrayBacked + +import pandas as pd +from pandas import ( + Categorical, + CategoricalDtype, +) +import pandas._testing as tm +from pandas.core.indexes.api import ( + CategoricalIndex, + Index, +) + + +class TestCategoricalIndex: + @pytest.fixture + def simple_index(self) -> CategoricalIndex: + return CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) + + def test_can_hold_identifiers(self): + idx = CategoricalIndex(list("aabbca"), categories=None, ordered=False) + key = idx[0] + assert idx._can_hold_identifiers_and_holds_name(key) is True + + def test_insert(self, simple_index): + ci = simple_index + categories = ci.categories + + # test 0th element + result = ci.insert(0, "a") + expected = CategoricalIndex(list("aaabbca"), categories=categories) + tm.assert_index_equal(result, expected, exact=True) + + # test Nth element that follows Python list behavior + result = ci.insert(-1, "a") + expected = CategoricalIndex(list("aabbcaa"), categories=categories) + tm.assert_index_equal(result, expected, exact=True) + + # test empty + result = CategoricalIndex([], categories=categories).insert(0, "a") + expected = CategoricalIndex(["a"], categories=categories) + tm.assert_index_equal(result, expected, exact=True) + + # invalid -> cast to object + expected = ci.astype(object).insert(0, "d") + result = ci.insert(0, "d").astype(object) + tm.assert_index_equal(result, expected, exact=True) + + # GH 18295 (test missing) + expected = CategoricalIndex(["a", np.nan, "a", "b", "c", "b"]) + for na in (np.nan, pd.NaT, None): + result = CategoricalIndex(list("aabcb")).insert(1, na) + tm.assert_index_equal(result, expected) + + def test_insert_na_mismatched_dtype(self): + ci = CategoricalIndex([0, 1, 1]) + result = ci.insert(0, pd.NaT) + expected = Index([pd.NaT, 0, 1, 1], dtype=object) + tm.assert_index_equal(result, expected) + + def test_delete(self, simple_index): + ci = simple_index + categories = ci.categories + + result = ci.delete(0) + expected = CategoricalIndex(list("abbca"), categories=categories) + tm.assert_index_equal(result, expected, exact=True) + + result = ci.delete(-1) + expected = CategoricalIndex(list("aabbc"), categories=categories) + tm.assert_index_equal(result, expected, exact=True) + + with tm.external_error_raised((IndexError, ValueError)): + # Either depending on NumPy version + ci.delete(10) + + @pytest.mark.parametrize( + "data, non_lexsorted_data", + [[[1, 2, 3], [9, 0, 1, 2, 3]], [list("abc"), list("fabcd")]], + ) + def test_is_monotonic(self, data, non_lexsorted_data): + c = CategoricalIndex(data) + assert c.is_monotonic_increasing is True + assert c.is_monotonic_decreasing is False + + c = CategoricalIndex(data, ordered=True) + assert c.is_monotonic_increasing is True + assert c.is_monotonic_decreasing is False + + c = CategoricalIndex(data, categories=reversed(data)) + assert c.is_monotonic_increasing is False + assert c.is_monotonic_decreasing is True + + c = CategoricalIndex(data, categories=reversed(data), ordered=True) + assert c.is_monotonic_increasing is False + assert c.is_monotonic_decreasing is True + + # test when data is neither monotonic increasing nor decreasing + reordered_data = [data[0], data[2], data[1]] + c = CategoricalIndex(reordered_data, categories=reversed(data)) + assert c.is_monotonic_increasing is False + assert c.is_monotonic_decreasing is False + + # non lexsorted categories + categories = non_lexsorted_data + + c = CategoricalIndex(categories[:2], categories=categories) + assert c.is_monotonic_increasing is True + assert c.is_monotonic_decreasing is False + + c = CategoricalIndex(categories[1:3], categories=categories) + assert c.is_monotonic_increasing is True + assert c.is_monotonic_decreasing is False + + def test_has_duplicates(self): + idx = CategoricalIndex([0, 0, 0], name="foo") + assert idx.is_unique is False + assert idx.has_duplicates is True + + idx = CategoricalIndex([0, 1], categories=[2, 3], name="foo") + assert idx.is_unique is False + assert idx.has_duplicates is True + + idx = CategoricalIndex([0, 1, 2, 3], categories=[1, 2, 3], name="foo") + assert idx.is_unique is True + assert idx.has_duplicates is False + + @pytest.mark.parametrize( + "data, categories, expected", + [ + ( + [1, 1, 1], + [1, 2, 3], + { + "first": np.array([False, True, True]), + "last": np.array([True, True, False]), + False: np.array([True, True, True]), + }, + ), + ( + [1, 1, 1], + list("abc"), + { + "first": np.array([False, True, True]), + "last": np.array([True, True, False]), + False: np.array([True, True, True]), + }, + ), + ( + [2, "a", "b"], + list("abc"), + { + "first": np.zeros(shape=(3), dtype=np.bool_), + "last": np.zeros(shape=(3), dtype=np.bool_), + False: np.zeros(shape=(3), dtype=np.bool_), + }, + ), + ( + list("abb"), + list("abc"), + { + "first": np.array([False, False, True]), + "last": np.array([False, True, False]), + False: np.array([False, True, True]), + }, + ), + ], + ) + def test_drop_duplicates(self, data, categories, expected): + idx = CategoricalIndex(data, categories=categories, name="foo") + for keep, e in expected.items(): + tm.assert_numpy_array_equal(idx.duplicated(keep=keep), e) + e = idx[~e] + result = idx.drop_duplicates(keep=keep) + tm.assert_index_equal(result, e) + + @pytest.mark.parametrize( + "data, categories, expected_data", + [ + ([1, 1, 1], [1, 2, 3], [1]), + ([1, 1, 1], list("abc"), [np.nan]), + ([1, 2, "a"], [1, 2, 3], [1, 2, np.nan]), + ([2, "a", "b"], list("abc"), [np.nan, "a", "b"]), + ], + ) + def test_unique(self, data, categories, expected_data, ordered): + dtype = CategoricalDtype(categories, ordered=ordered) + + idx = CategoricalIndex(data, dtype=dtype) + expected = CategoricalIndex(expected_data, dtype=dtype) + tm.assert_index_equal(idx.unique(), expected) + + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr doesn't roundtrip") + def test_repr_roundtrip(self): + ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) + str(ci) + tm.assert_index_equal(eval(repr(ci)), ci, exact=True) + + # formatting + str(ci) + + # long format + # this is not reprable + ci = CategoricalIndex(np.random.default_rng(2).integers(0, 5, size=100)) + str(ci) + + def test_isin(self): + ci = CategoricalIndex(list("aabca") + [np.nan], categories=["c", "a", "b"]) + tm.assert_numpy_array_equal( + ci.isin(["c"]), np.array([False, False, False, True, False, False]) + ) + tm.assert_numpy_array_equal( + ci.isin(["c", "a", "b"]), np.array([True] * 5 + [False]) + ) + tm.assert_numpy_array_equal( + ci.isin(["c", "a", "b", np.nan]), np.array([True] * 6) + ) + + # mismatched categorical -> coerced to ndarray so doesn't matter + result = ci.isin(ci.set_categories(list("abcdefghi"))) + expected = np.array([True] * 6) + tm.assert_numpy_array_equal(result, expected) + + result = ci.isin(ci.set_categories(list("defghi"))) + expected = np.array([False] * 5 + [True]) + tm.assert_numpy_array_equal(result, expected) + + def test_isin_overlapping_intervals(self): + # GH 34974 + idx = pd.IntervalIndex([pd.Interval(0, 2), pd.Interval(0, 1)]) + result = CategoricalIndex(idx).isin(idx) + expected = np.array([True, True]) + tm.assert_numpy_array_equal(result, expected) + + def test_identical(self): + ci1 = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) + ci2 = CategoricalIndex(["a", "b"], categories=["a", "b", "c"], ordered=True) + assert ci1.identical(ci1) + assert ci1.identical(ci1.copy()) + assert not ci1.identical(ci2) + + def test_ensure_copied_data(self): + # gh-12309: Check the "copy" argument of each + # Index.__new__ is honored. + # + # Must be tested separately from other indexes because + # self.values is not an ndarray. + index = CategoricalIndex(list("ab") * 5) + + result = CategoricalIndex(index.values, copy=True) + tm.assert_index_equal(index, result) + assert not np.shares_memory(result._data._codes, index._data._codes) + + result = CategoricalIndex(index.values, copy=False) + assert result._data._codes is index._data._codes + + +class TestCategoricalIndex2: + def test_view_i8(self): + # GH#25464 + ci = CategoricalIndex(list("ab") * 50) + msg = "When changing to a larger dtype, its size must be a divisor" + with pytest.raises(ValueError, match=msg): + ci.view("i8") + with pytest.raises(ValueError, match=msg): + ci._data.view("i8") + + ci = ci[:-4] # length divisible by 8 + + res = ci.view("i8") + expected = ci._data.codes.view("i8") + tm.assert_numpy_array_equal(res, expected) + + cat = ci._data + tm.assert_numpy_array_equal(cat.view("i8"), expected) + + @pytest.mark.parametrize( + "dtype, engine_type", + [ + (np.int8, libindex.Int8Engine), + (np.int16, libindex.Int16Engine), + (np.int32, libindex.Int32Engine), + (np.int64, libindex.Int64Engine), + ], + ) + def test_engine_type(self, dtype, engine_type): + if dtype != np.int64: + # num. of uniques required to push CategoricalIndex.codes to a + # dtype (128 categories required for .codes dtype to be int16 etc.) + num_uniques = {np.int8: 1, np.int16: 128, np.int32: 32768}[dtype] + ci = CategoricalIndex(range(num_uniques)) + else: + # having 2**32 - 2**31 categories would be very memory-intensive, + # so we cheat a bit with the dtype + ci = CategoricalIndex(range(32768)) # == 2**16 - 2**(16 - 1) + arr = ci.values._ndarray.astype("int64") + NDArrayBacked.__init__(ci._data, arr, ci.dtype) + assert np.issubdtype(ci.codes.dtype, dtype) + assert isinstance(ci._engine, engine_type) + + @pytest.mark.parametrize( + "func,op_name", + [ + (lambda idx: idx - idx, "__sub__"), + (lambda idx: idx + idx, "__add__"), + (lambda idx: idx - ["a", "b"], "__sub__"), + (lambda idx: idx + ["a", "b"], "__add__"), + (lambda idx: ["a", "b"] - idx, "__rsub__"), + (lambda idx: ["a", "b"] + idx, "__radd__"), + ], + ) + def test_disallow_addsub_ops(self, func, op_name): + # GH 10039 + # set ops (+/-) raise TypeError + idx = Index(Categorical(["a", "b"])) + cat_or_list = "'(Categorical|list)' and '(Categorical|list)'" + msg = "|".join( + [ + f"cannot perform {op_name} with this index type: CategoricalIndex", + "can only concatenate list", + rf"unsupported operand type\(s\) for [\+-]: {cat_or_list}", + ] + ) + with pytest.raises(TypeError, match=msg): + func(idx) + + def test_method_delegation(self): + ci = CategoricalIndex(list("aabbca"), categories=list("cabdef")) + result = ci.set_categories(list("cab")) + tm.assert_index_equal( + result, CategoricalIndex(list("aabbca"), categories=list("cab")) + ) + + ci = CategoricalIndex(list("aabbca"), categories=list("cab")) + result = ci.rename_categories(list("efg")) + tm.assert_index_equal( + result, CategoricalIndex(list("ffggef"), categories=list("efg")) + ) + + # GH18862 (let rename_categories take callables) + result = ci.rename_categories(lambda x: x.upper()) + tm.assert_index_equal( + result, CategoricalIndex(list("AABBCA"), categories=list("CAB")) + ) + + ci = CategoricalIndex(list("aabbca"), categories=list("cab")) + result = ci.add_categories(["d"]) + tm.assert_index_equal( + result, CategoricalIndex(list("aabbca"), categories=list("cabd")) + ) + + ci = CategoricalIndex(list("aabbca"), categories=list("cab")) + result = ci.remove_categories(["c"]) + tm.assert_index_equal( + result, + CategoricalIndex(list("aabb") + [np.nan] + ["a"], categories=list("ab")), + ) + + ci = CategoricalIndex(list("aabbca"), categories=list("cabdef")) + result = ci.as_unordered() + tm.assert_index_equal(result, ci) + + ci = CategoricalIndex(list("aabbca"), categories=list("cabdef")) + result = ci.as_ordered() + tm.assert_index_equal( + result, + CategoricalIndex(list("aabbca"), categories=list("cabdef"), ordered=True), + ) + + # invalid + msg = "cannot use inplace with CategoricalIndex" + with pytest.raises(ValueError, match=msg): + ci.set_categories(list("cab"), inplace=True) + + def test_remove_maintains_order(self): + ci = CategoricalIndex(list("abcdda"), categories=list("abcd")) + result = ci.reorder_categories(["d", "c", "b", "a"], ordered=True) + tm.assert_index_equal( + result, + CategoricalIndex(list("abcdda"), categories=list("dcba"), ordered=True), + ) + result = result.remove_categories(["c"]) + tm.assert_index_equal( + result, + CategoricalIndex( + ["a", "b", np.nan, "d", "d", "a"], categories=list("dba"), ordered=True + ), + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_constructors.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_constructors.py new file mode 100644 index 0000000000000000000000000000000000000000..f0c5307fc5c641ff25d26bd2bd8a158b43dd6a6d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_constructors.py @@ -0,0 +1,142 @@ +import numpy as np +import pytest + +from pandas import ( + Categorical, + CategoricalDtype, + CategoricalIndex, + Index, +) +import pandas._testing as tm + + +class TestCategoricalIndexConstructors: + def test_construction_disallows_scalar(self): + msg = "must be called with a collection of some kind" + with pytest.raises(TypeError, match=msg): + CategoricalIndex(data=1, categories=list("abcd"), ordered=False) + with pytest.raises(TypeError, match=msg): + CategoricalIndex(categories=list("abcd"), ordered=False) + + def test_construction(self): + ci = CategoricalIndex(list("aabbca"), categories=list("abcd"), ordered=False) + categories = ci.categories + + result = Index(ci) + tm.assert_index_equal(result, ci, exact=True) + assert not result.ordered + + result = Index(ci.values) + tm.assert_index_equal(result, ci, exact=True) + assert not result.ordered + + # empty + result = CategoricalIndex([], categories=categories) + tm.assert_index_equal(result.categories, Index(categories)) + tm.assert_numpy_array_equal(result.codes, np.array([], dtype="int8")) + assert not result.ordered + + # passing categories + result = CategoricalIndex(list("aabbca"), categories=categories) + tm.assert_index_equal(result.categories, Index(categories)) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) + + c = Categorical(list("aabbca")) + result = CategoricalIndex(c) + tm.assert_index_equal(result.categories, Index(list("abc"))) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) + assert not result.ordered + + result = CategoricalIndex(c, categories=categories) + tm.assert_index_equal(result.categories, Index(categories)) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) + assert not result.ordered + + ci = CategoricalIndex(c, categories=list("abcd")) + result = CategoricalIndex(ci) + tm.assert_index_equal(result.categories, Index(categories)) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) + assert not result.ordered + + result = CategoricalIndex(ci, categories=list("ab")) + tm.assert_index_equal(result.categories, Index(list("ab"))) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, -1, 0], dtype="int8") + ) + assert not result.ordered + + result = CategoricalIndex(ci, categories=list("ab"), ordered=True) + tm.assert_index_equal(result.categories, Index(list("ab"))) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, -1, 0], dtype="int8") + ) + assert result.ordered + + result = CategoricalIndex(ci, categories=list("ab"), ordered=True) + expected = CategoricalIndex( + ci, categories=list("ab"), ordered=True, dtype="category" + ) + tm.assert_index_equal(result, expected, exact=True) + + # turn me to an Index + result = Index(np.array(ci)) + assert isinstance(result, Index) + assert not isinstance(result, CategoricalIndex) + + def test_construction_with_dtype(self): + # specify dtype + ci = CategoricalIndex(list("aabbca"), categories=list("abc"), ordered=False) + + result = Index(np.array(ci), dtype="category") + tm.assert_index_equal(result, ci, exact=True) + + result = Index(np.array(ci).tolist(), dtype="category") + tm.assert_index_equal(result, ci, exact=True) + + # these are generally only equal when the categories are reordered + ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) + + result = Index(np.array(ci), dtype="category").reorder_categories(ci.categories) + tm.assert_index_equal(result, ci, exact=True) + + # make sure indexes are handled + idx = Index(range(3)) + expected = CategoricalIndex([0, 1, 2], categories=idx, ordered=True) + result = CategoricalIndex(idx, categories=idx, ordered=True) + tm.assert_index_equal(result, expected, exact=True) + + def test_construction_empty_with_bool_categories(self): + # see GH#22702 + cat = CategoricalIndex([], categories=[True, False]) + categories = sorted(cat.categories.tolist()) + assert categories == [False, True] + + def test_construction_with_categorical_dtype(self): + # construction with CategoricalDtype + # GH#18109 + data, cats, ordered = "a a b b".split(), "c b a".split(), True + dtype = CategoricalDtype(categories=cats, ordered=ordered) + + result = CategoricalIndex(data, dtype=dtype) + expected = CategoricalIndex(data, categories=cats, ordered=ordered) + tm.assert_index_equal(result, expected, exact=True) + + # GH#19032 + result = Index(data, dtype=dtype) + tm.assert_index_equal(result, expected, exact=True) + + # error when combining categories/ordered and dtype kwargs + msg = "Cannot specify `categories` or `ordered` together with `dtype`." + with pytest.raises(ValueError, match=msg): + CategoricalIndex(data, categories=cats, dtype=dtype) + + with pytest.raises(ValueError, match=msg): + CategoricalIndex(data, ordered=ordered, dtype=dtype) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_equals.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_equals.py new file mode 100644 index 0000000000000000000000000000000000000000..a8353f301a3c39a50b2a0c5541722551ff660e30 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_equals.py @@ -0,0 +1,96 @@ +import numpy as np +import pytest + +from pandas import ( + Categorical, + CategoricalIndex, + Index, + MultiIndex, +) + + +class TestEquals: + def test_equals_categorical(self): + ci1 = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) + ci2 = CategoricalIndex(["a", "b"], categories=["a", "b", "c"], ordered=True) + + assert ci1.equals(ci1) + assert not ci1.equals(ci2) + assert ci1.equals(ci1.astype(object)) + assert ci1.astype(object).equals(ci1) + + assert (ci1 == ci1).all() + assert not (ci1 != ci1).all() + assert not (ci1 > ci1).all() + assert not (ci1 < ci1).all() + assert (ci1 <= ci1).all() + assert (ci1 >= ci1).all() + + assert not (ci1 == 1).all() + assert (ci1 == Index(["a", "b"])).all() + assert (ci1 == ci1.values).all() + + # invalid comparisons + with pytest.raises(ValueError, match="Lengths must match"): + ci1 == Index(["a", "b", "c"]) + + msg = "Categoricals can only be compared if 'categories' are the same" + with pytest.raises(TypeError, match=msg): + ci1 == ci2 + with pytest.raises(TypeError, match=msg): + ci1 == Categorical(ci1.values, ordered=False) + with pytest.raises(TypeError, match=msg): + ci1 == Categorical(ci1.values, categories=list("abc")) + + # tests + # make sure that we are testing for category inclusion properly + ci = CategoricalIndex(list("aabca"), categories=["c", "a", "b"]) + assert not ci.equals(list("aabca")) + # Same categories, but different order + # Unordered + assert ci.equals(CategoricalIndex(list("aabca"))) + # Ordered + assert not ci.equals(CategoricalIndex(list("aabca"), ordered=True)) + assert ci.equals(ci.copy()) + + ci = CategoricalIndex(list("aabca") + [np.nan], categories=["c", "a", "b"]) + assert not ci.equals(list("aabca")) + assert not ci.equals(CategoricalIndex(list("aabca"))) + assert ci.equals(ci.copy()) + + ci = CategoricalIndex(list("aabca") + [np.nan], categories=["c", "a", "b"]) + assert not ci.equals(list("aabca") + [np.nan]) + assert ci.equals(CategoricalIndex(list("aabca") + [np.nan])) + assert not ci.equals(CategoricalIndex(list("aabca") + [np.nan], ordered=True)) + assert ci.equals(ci.copy()) + + def test_equals_categorical_unordered(self): + # https://github.com/pandas-dev/pandas/issues/16603 + a = CategoricalIndex(["A"], categories=["A", "B"]) + b = CategoricalIndex(["A"], categories=["B", "A"]) + c = CategoricalIndex(["C"], categories=["B", "A"]) + assert a.equals(b) + assert not a.equals(c) + assert not b.equals(c) + + def test_equals_non_category(self): + # GH#37667 Case where other contains a value not among ci's + # categories ("D") and also contains np.nan + ci = CategoricalIndex(["A", "B", np.nan, np.nan]) + other = Index(["A", "B", "D", np.nan]) + + assert not ci.equals(other) + + def test_equals_multiindex(self): + # dont raise NotImplementedError when calling is_dtype_compat + + mi = MultiIndex.from_arrays([["A", "B", "C", "D"], range(4)]) + ci = mi.to_flat_index().astype("category") + + assert not ci.equals(mi) + + def test_equals_string_dtype(self, any_string_dtype): + # GH#55364 + idx = CategoricalIndex(list("abc"), name="B") + other = Index(["a", "b", "c"], name="B", dtype=any_string_dtype) + assert idx.equals(other) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_fillna.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_fillna.py new file mode 100644 index 0000000000000000000000000000000000000000..09de578f3c649e5a90278f11b1e3cd5b1d0646d5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_fillna.py @@ -0,0 +1,54 @@ +import numpy as np +import pytest + +from pandas import CategoricalIndex +import pandas._testing as tm + + +class TestFillNA: + def test_fillna_categorical(self): + # GH#11343 + idx = CategoricalIndex([1.0, np.nan, 3.0, 1.0], name="x") + # fill by value in categories + exp = CategoricalIndex([1.0, 1.0, 3.0, 1.0], name="x") + tm.assert_index_equal(idx.fillna(1.0), exp) + + cat = idx._data + + # fill by value not in categories raises TypeError on EA, casts on CI + msg = "Cannot setitem on a Categorical with a new category" + with pytest.raises(TypeError, match=msg): + cat.fillna(2.0) + + result = idx.fillna(2.0) + expected = idx.astype(object).fillna(2.0) + tm.assert_index_equal(result, expected) + + def test_fillna_copies_with_no_nas(self): + # Nothing to fill, should still get a copy for the Categorical method, + # but OK to get a view on CategoricalIndex method + ci = CategoricalIndex([0, 1, 1]) + result = ci.fillna(0) + assert result is not ci + assert tm.shares_memory(result, ci) + + # But at the EA level we always get a copy. + cat = ci._data + result = cat.fillna(0) + assert result._ndarray is not cat._ndarray + assert result._ndarray.base is None + assert not tm.shares_memory(result, cat) + + def test_fillna_validates_with_no_nas(self): + # We validate the fill value even if fillna is a no-op + ci = CategoricalIndex([2, 3, 3]) + cat = ci._data + + msg = "Cannot setitem on a Categorical with a new category" + res = ci.fillna(False) + # nothing to fill, so we dont cast + tm.assert_index_equal(res, ci) + + # Same check directly on the Categorical + with pytest.raises(TypeError, match=msg): + cat.fillna(False) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_formats.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_formats.py new file mode 100644 index 0000000000000000000000000000000000000000..522ca1bc2afde451e2d6feff780a973aceb4c39f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_formats.py @@ -0,0 +1,120 @@ +""" +Tests for CategoricalIndex.__repr__ and related methods. +""" +import pytest + +from pandas._config import using_pyarrow_string_dtype +import pandas._config.config as cf + +from pandas import CategoricalIndex +import pandas._testing as tm + + +class TestCategoricalIndexRepr: + def test_format_different_scalar_lengths(self): + # GH#35439 + idx = CategoricalIndex(["aaaaaaaaa", "b"]) + expected = ["aaaaaaaaa", "b"] + msg = r"CategoricalIndex\.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert idx.format() == expected + + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") + def test_string_categorical_index_repr(self): + # short + idx = CategoricalIndex(["a", "bb", "ccc"]) + expected = """CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa: E501 + assert repr(idx) == expected + + # multiple lines + idx = CategoricalIndex(["a", "bb", "ccc"] * 10) + expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', + 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', + 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], + categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa: E501 + + assert repr(idx) == expected + + # truncated + idx = CategoricalIndex(["a", "bb", "ccc"] * 100) + expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', + ... + 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], + categories=['a', 'bb', 'ccc'], ordered=False, dtype='category', length=300)""" # noqa: E501 + + assert repr(idx) == expected + + # larger categories + idx = CategoricalIndex(list("abcdefghijklmmo")) + expected = """CategoricalIndex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', + 'm', 'm', 'o'], + categories=['a', 'b', 'c', 'd', ..., 'k', 'l', 'm', 'o'], ordered=False, dtype='category')""" # noqa: E501 + + assert repr(idx) == expected + + # short + idx = CategoricalIndex(["あ", "いい", "ううう"]) + expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 + assert repr(idx) == expected + + # multiple lines + idx = CategoricalIndex(["あ", "いい", "ううう"] * 10) + expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', + 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', + 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 + + assert repr(idx) == expected + + # truncated + idx = CategoricalIndex(["あ", "いい", "ううう"] * 100) + expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', + ... + 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa: E501 + + assert repr(idx) == expected + + # larger categories + idx = CategoricalIndex(list("あいうえおかきくけこさしすせそ")) + expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', + 'す', 'せ', 'そ'], + categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ'], ordered=False, dtype='category')""" # noqa: E501 + + assert repr(idx) == expected + + # Enable Unicode option ----------------------------------------- + with cf.option_context("display.unicode.east_asian_width", True): + # short + idx = CategoricalIndex(["あ", "いい", "ううう"]) + expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 + assert repr(idx) == expected + + # multiple lines + idx = CategoricalIndex(["あ", "いい", "ううう"] * 10) + expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', + 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', + 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', + 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 + + assert repr(idx) == expected + + # truncated + idx = CategoricalIndex(["あ", "いい", "ううう"] * 100) + expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', + 'ううう', 'あ', + ... + 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', + 'あ', 'いい', 'ううう'], + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa: E501 + + assert repr(idx) == expected + + # larger categories + idx = CategoricalIndex(list("あいうえおかきくけこさしすせそ")) + expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', + 'さ', 'し', 'す', 'せ', 'そ'], + categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ'], ordered=False, dtype='category')""" # noqa: E501 + + assert repr(idx) == expected diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_indexing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_indexing.py new file mode 100644 index 0000000000000000000000000000000000000000..49eb79da616e7603b70ee3189e9004dd51fb33e7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_indexing.py @@ -0,0 +1,420 @@ +import numpy as np +import pytest + +from pandas.errors import InvalidIndexError + +import pandas as pd +from pandas import ( + CategoricalIndex, + Index, + IntervalIndex, + Timestamp, +) +import pandas._testing as tm + + +class TestTake: + def test_take_fill_value(self): + # GH 12631 + + # numeric category + idx = CategoricalIndex([1, 2, 3], name="xxx") + result = idx.take(np.array([1, 0, -1])) + expected = CategoricalIndex([2, 1, 3], name="xxx") + tm.assert_index_equal(result, expected) + tm.assert_categorical_equal(result.values, expected.values) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + expected = CategoricalIndex([2, 1, np.nan], categories=[1, 2, 3], name="xxx") + tm.assert_index_equal(result, expected) + tm.assert_categorical_equal(result.values, expected.values) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = CategoricalIndex([2, 1, 3], name="xxx") + tm.assert_index_equal(result, expected) + tm.assert_categorical_equal(result.values, expected.values) + + # object category + idx = CategoricalIndex( + list("CBA"), categories=list("ABC"), ordered=True, name="xxx" + ) + result = idx.take(np.array([1, 0, -1])) + expected = CategoricalIndex( + list("BCA"), categories=list("ABC"), ordered=True, name="xxx" + ) + tm.assert_index_equal(result, expected) + tm.assert_categorical_equal(result.values, expected.values) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + expected = CategoricalIndex( + ["B", "C", np.nan], categories=list("ABC"), ordered=True, name="xxx" + ) + tm.assert_index_equal(result, expected) + tm.assert_categorical_equal(result.values, expected.values) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = CategoricalIndex( + list("BCA"), categories=list("ABC"), ordered=True, name="xxx" + ) + tm.assert_index_equal(result, expected) + tm.assert_categorical_equal(result.values, expected.values) + + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + msg = "index -5 is out of bounds for (axis 0 with )?size 3" + with pytest.raises(IndexError, match=msg): + idx.take(np.array([1, -5])) + + def test_take_fill_value_datetime(self): + # datetime category + idx = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"], name="xxx") + idx = CategoricalIndex(idx) + result = idx.take(np.array([1, 0, -1])) + expected = pd.DatetimeIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx" + ) + expected = CategoricalIndex(expected) + tm.assert_index_equal(result, expected) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + expected = pd.DatetimeIndex(["2011-02-01", "2011-01-01", "NaT"], name="xxx") + exp_cats = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"]) + expected = CategoricalIndex(expected, categories=exp_cats) + tm.assert_index_equal(result, expected) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.DatetimeIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx" + ) + expected = CategoricalIndex(expected) + tm.assert_index_equal(result, expected) + + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + msg = "index -5 is out of bounds for (axis 0 with )?size 3" + with pytest.raises(IndexError, match=msg): + idx.take(np.array([1, -5])) + + def test_take_invalid_kwargs(self): + idx = CategoricalIndex([1, 2, 3], name="foo") + indices = [1, 0, -1] + + msg = r"take\(\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=msg): + idx.take(indices, foo=2) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + idx.take(indices, out=indices) + + msg = "the 'mode' parameter is not supported" + with pytest.raises(ValueError, match=msg): + idx.take(indices, mode="clip") + + +class TestGetLoc: + def test_get_loc(self): + # GH 12531 + cidx1 = CategoricalIndex(list("abcde"), categories=list("edabc")) + idx1 = Index(list("abcde")) + assert cidx1.get_loc("a") == idx1.get_loc("a") + assert cidx1.get_loc("e") == idx1.get_loc("e") + + for i in [cidx1, idx1]: + with pytest.raises(KeyError, match="'NOT-EXIST'"): + i.get_loc("NOT-EXIST") + + # non-unique + cidx2 = CategoricalIndex(list("aacded"), categories=list("edabc")) + idx2 = Index(list("aacded")) + + # results in bool array + res = cidx2.get_loc("d") + tm.assert_numpy_array_equal(res, idx2.get_loc("d")) + tm.assert_numpy_array_equal( + res, np.array([False, False, False, True, False, True]) + ) + # unique element results in scalar + res = cidx2.get_loc("e") + assert res == idx2.get_loc("e") + assert res == 4 + + for i in [cidx2, idx2]: + with pytest.raises(KeyError, match="'NOT-EXIST'"): + i.get_loc("NOT-EXIST") + + # non-unique, sliceable + cidx3 = CategoricalIndex(list("aabbb"), categories=list("abc")) + idx3 = Index(list("aabbb")) + + # results in slice + res = cidx3.get_loc("a") + assert res == idx3.get_loc("a") + assert res == slice(0, 2, None) + + res = cidx3.get_loc("b") + assert res == idx3.get_loc("b") + assert res == slice(2, 5, None) + + for i in [cidx3, idx3]: + with pytest.raises(KeyError, match="'c'"): + i.get_loc("c") + + def test_get_loc_unique(self): + cidx = CategoricalIndex(list("abc")) + result = cidx.get_loc("b") + assert result == 1 + + def test_get_loc_monotonic_nonunique(self): + cidx = CategoricalIndex(list("abbc")) + result = cidx.get_loc("b") + expected = slice(1, 3, None) + assert result == expected + + def test_get_loc_nonmonotonic_nonunique(self): + cidx = CategoricalIndex(list("abcb")) + result = cidx.get_loc("b") + expected = np.array([False, True, False, True], dtype=bool) + tm.assert_numpy_array_equal(result, expected) + + def test_get_loc_nan(self): + # GH#41933 + ci = CategoricalIndex(["A", "B", np.nan]) + res = ci.get_loc(np.nan) + + assert res == 2 + + +class TestGetIndexer: + def test_get_indexer_base(self): + # Determined by cat ordering. + idx = CategoricalIndex(list("cab"), categories=list("cab")) + expected = np.arange(len(idx), dtype=np.intp) + + actual = idx.get_indexer(idx) + tm.assert_numpy_array_equal(expected, actual) + + with pytest.raises(ValueError, match="Invalid fill method"): + idx.get_indexer(idx, method="invalid") + + def test_get_indexer_requires_unique(self): + ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) + oidx = Index(np.array(ci)) + + msg = "Reindexing only valid with uniquely valued Index objects" + + for n in [1, 2, 5, len(ci)]: + finder = oidx[np.random.default_rng(2).integers(0, len(ci), size=n)] + + with pytest.raises(InvalidIndexError, match=msg): + ci.get_indexer(finder) + + # see gh-17323 + # + # Even when indexer is equal to the + # members in the index, we should + # respect duplicates instead of taking + # the fast-track path. + for finder in [list("aabbca"), list("aababca")]: + with pytest.raises(InvalidIndexError, match=msg): + ci.get_indexer(finder) + + def test_get_indexer_non_unique(self): + idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc")) + idx2 = CategoricalIndex(list("abf")) + + for indexer in [idx2, list("abf"), Index(list("abf"))]: + msg = "Reindexing only valid with uniquely valued Index objects" + with pytest.raises(InvalidIndexError, match=msg): + idx1.get_indexer(indexer) + + r1, _ = idx1.get_indexer_non_unique(indexer) + expected = np.array([0, 1, 2, -1], dtype=np.intp) + tm.assert_almost_equal(r1, expected) + + def test_get_indexer_method(self): + idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc")) + idx2 = CategoricalIndex(list("abf")) + + msg = "method pad not yet implemented for CategoricalIndex" + with pytest.raises(NotImplementedError, match=msg): + idx2.get_indexer(idx1, method="pad") + msg = "method backfill not yet implemented for CategoricalIndex" + with pytest.raises(NotImplementedError, match=msg): + idx2.get_indexer(idx1, method="backfill") + + msg = "method nearest not yet implemented for CategoricalIndex" + with pytest.raises(NotImplementedError, match=msg): + idx2.get_indexer(idx1, method="nearest") + + def test_get_indexer_array(self): + arr = np.array( + [Timestamp("1999-12-31 00:00:00"), Timestamp("2000-12-31 00:00:00")], + dtype=object, + ) + cats = [Timestamp("1999-12-31 00:00:00"), Timestamp("2000-12-31 00:00:00")] + ci = CategoricalIndex(cats, categories=cats, ordered=False, dtype="category") + result = ci.get_indexer(arr) + expected = np.array([0, 1], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + def test_get_indexer_same_categories_same_order(self): + ci = CategoricalIndex(["a", "b"], categories=["a", "b"]) + + result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["a", "b"])) + expected = np.array([1, 1], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + def test_get_indexer_same_categories_different_order(self): + # https://github.com/pandas-dev/pandas/issues/19551 + ci = CategoricalIndex(["a", "b"], categories=["a", "b"]) + + result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["b", "a"])) + expected = np.array([1, 1], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + def test_get_indexer_nans_in_index_and_target(self): + # GH 45361 + ci = CategoricalIndex([1, 2, np.nan, 3]) + other1 = [2, 3, 4, np.nan] + res1 = ci.get_indexer(other1) + expected1 = np.array([1, 3, -1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(res1, expected1) + other2 = [1, 4, 2, 3] + res2 = ci.get_indexer(other2) + expected2 = np.array([0, -1, 1, 3], dtype=np.intp) + tm.assert_numpy_array_equal(res2, expected2) + + +class TestWhere: + def test_where(self, listlike_box): + klass = listlike_box + + i = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) + cond = [True] * len(i) + expected = i + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) + + cond = [False] + [True] * (len(i) - 1) + expected = CategoricalIndex([np.nan] + i[1:].tolist(), categories=i.categories) + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) + + def test_where_non_categories(self): + ci = CategoricalIndex(["a", "b", "c", "d"]) + mask = np.array([True, False, True, False]) + + result = ci.where(mask, 2) + expected = Index(["a", 2, "c", 2], dtype=object) + tm.assert_index_equal(result, expected) + + msg = "Cannot setitem on a Categorical with a new category" + with pytest.raises(TypeError, match=msg): + # Test the Categorical method directly + ci._data._where(mask, 2) + + +class TestContains: + def test_contains(self): + ci = CategoricalIndex(list("aabbca"), categories=list("cabdef"), ordered=False) + + assert "a" in ci + assert "z" not in ci + assert "e" not in ci + assert np.nan not in ci + + # assert codes NOT in index + assert 0 not in ci + assert 1 not in ci + + def test_contains_nan(self): + ci = CategoricalIndex(list("aabbca") + [np.nan], categories=list("cabdef")) + assert np.nan in ci + + @pytest.mark.parametrize("unwrap", [True, False]) + def test_contains_na_dtype(self, unwrap): + dti = pd.date_range("2016-01-01", periods=100).insert(0, pd.NaT) + pi = dti.to_period("D") + tdi = dti - dti[-1] + ci = CategoricalIndex(dti) + + obj = ci + if unwrap: + obj = ci._data + + assert np.nan in obj + assert None in obj + assert pd.NaT in obj + assert np.datetime64("NaT") in obj + assert np.timedelta64("NaT") not in obj + + obj2 = CategoricalIndex(tdi) + if unwrap: + obj2 = obj2._data + + assert np.nan in obj2 + assert None in obj2 + assert pd.NaT in obj2 + assert np.datetime64("NaT") not in obj2 + assert np.timedelta64("NaT") in obj2 + + obj3 = CategoricalIndex(pi) + if unwrap: + obj3 = obj3._data + + assert np.nan in obj3 + assert None in obj3 + assert pd.NaT in obj3 + assert np.datetime64("NaT") not in obj3 + assert np.timedelta64("NaT") not in obj3 + + @pytest.mark.parametrize( + "item, expected", + [ + (pd.Interval(0, 1), True), + (1.5, True), + (pd.Interval(0.5, 1.5), False), + ("a", False), + (Timestamp(1), False), + (pd.Timedelta(1), False), + ], + ids=str, + ) + def test_contains_interval(self, item, expected): + # GH 23705 + ci = CategoricalIndex(IntervalIndex.from_breaks(range(3))) + result = item in ci + assert result is expected + + def test_contains_list(self): + # GH#21729 + idx = CategoricalIndex([1, 2, 3]) + + assert "a" not in idx + + with pytest.raises(TypeError, match="unhashable type"): + ["a"] in idx + + with pytest.raises(TypeError, match="unhashable type"): + ["a", "b"] in idx diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_map.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_map.py new file mode 100644 index 0000000000000000000000000000000000000000..baf836594dfb5e03332b57522f39a679ee5b1e40 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_map.py @@ -0,0 +1,144 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + CategoricalIndex, + Index, + Series, +) +import pandas._testing as tm + + +@pytest.mark.parametrize( + "data, categories", + [ + (list("abcbca"), list("cab")), + (pd.interval_range(0, 3).repeat(3), pd.interval_range(0, 3)), + ], + ids=["string", "interval"], +) +def test_map_str(data, categories, ordered): + # GH 31202 - override base class since we want to maintain categorical/ordered + index = CategoricalIndex(data, categories=categories, ordered=ordered) + result = index.map(str) + expected = CategoricalIndex( + map(str, data), categories=map(str, categories), ordered=ordered + ) + tm.assert_index_equal(result, expected) + + +def test_map(): + ci = CategoricalIndex(list("ABABC"), categories=list("CBA"), ordered=True) + result = ci.map(lambda x: x.lower()) + exp = CategoricalIndex(list("ababc"), categories=list("cba"), ordered=True) + tm.assert_index_equal(result, exp) + + ci = CategoricalIndex( + list("ABABC"), categories=list("BAC"), ordered=False, name="XXX" + ) + result = ci.map(lambda x: x.lower()) + exp = CategoricalIndex( + list("ababc"), categories=list("bac"), ordered=False, name="XXX" + ) + tm.assert_index_equal(result, exp) + + # GH 12766: Return an index not an array + tm.assert_index_equal( + ci.map(lambda x: 1), Index(np.array([1] * 5, dtype=np.int64), name="XXX") + ) + + # change categories dtype + ci = CategoricalIndex(list("ABABC"), categories=list("BAC"), ordered=False) + + def f(x): + return {"A": 10, "B": 20, "C": 30}.get(x) + + result = ci.map(f) + exp = CategoricalIndex([10, 20, 10, 20, 30], categories=[20, 10, 30], ordered=False) + tm.assert_index_equal(result, exp) + + result = ci.map(Series([10, 20, 30], index=["A", "B", "C"])) + tm.assert_index_equal(result, exp) + + result = ci.map({"A": 10, "B": 20, "C": 30}) + tm.assert_index_equal(result, exp) + + +def test_map_with_categorical_series(): + # GH 12756 + a = Index([1, 2, 3, 4]) + b = Series(["even", "odd", "even", "odd"], dtype="category") + c = Series(["even", "odd", "even", "odd"]) + + exp = CategoricalIndex(["odd", "even", "odd", np.nan]) + tm.assert_index_equal(a.map(b), exp) + exp = Index(["odd", "even", "odd", np.nan]) + tm.assert_index_equal(a.map(c), exp) + + +@pytest.mark.parametrize( + ("data", "f", "expected"), + ( + ([1, 1, np.nan], pd.isna, CategoricalIndex([False, False, np.nan])), + ([1, 2, np.nan], pd.isna, Index([False, False, np.nan])), + ([1, 1, np.nan], {1: False}, CategoricalIndex([False, False, np.nan])), + ([1, 2, np.nan], {1: False, 2: False}, Index([False, False, np.nan])), + ( + [1, 1, np.nan], + Series([False, False]), + CategoricalIndex([False, False, np.nan]), + ), + ( + [1, 2, np.nan], + Series([False, False, False]), + Index([False, False, np.nan]), + ), + ), +) +def test_map_with_nan_ignore(data, f, expected): # GH 24241 + values = CategoricalIndex(data) + result = values.map(f, na_action="ignore") + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize( + ("data", "f", "expected"), + ( + ([1, 1, np.nan], pd.isna, Index([False, False, True])), + ([1, 2, np.nan], pd.isna, Index([False, False, True])), + ([1, 1, np.nan], {1: False}, CategoricalIndex([False, False, np.nan])), + ([1, 2, np.nan], {1: False, 2: False}, Index([False, False, np.nan])), + ( + [1, 1, np.nan], + Series([False, False]), + CategoricalIndex([False, False, np.nan]), + ), + ( + [1, 2, np.nan], + Series([False, False, False]), + Index([False, False, np.nan]), + ), + ), +) +def test_map_with_nan_none(data, f, expected): # GH 24241 + values = CategoricalIndex(data) + result = values.map(f, na_action=None) + tm.assert_index_equal(result, expected) + + +def test_map_with_dict_or_series(): + orig_values = ["a", "B", 1, "a"] + new_values = ["one", 2, 3.0, "one"] + cur_index = CategoricalIndex(orig_values, name="XXX") + expected = CategoricalIndex(new_values, name="XXX", categories=[3.0, 2, "one"]) + + mapper = Series(new_values[:-1], index=orig_values[:-1]) + result = cur_index.map(mapper) + # Order of categories in result can be different + tm.assert_index_equal(result, expected) + + mapper = dict(zip(orig_values[:-1], new_values[:-1])) + result = cur_index.map(mapper) + # Order of categories in result can be different + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_reindex.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_reindex.py new file mode 100644 index 0000000000000000000000000000000000000000..5b1f2b9fb159a6873c83e0a0a4e777913bb99fee --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_reindex.py @@ -0,0 +1,78 @@ +import numpy as np +import pytest + +from pandas import ( + Categorical, + CategoricalIndex, + Index, + Interval, +) +import pandas._testing as tm + + +class TestReindex: + def test_reindex_list_non_unique(self): + # GH#11586 + msg = "cannot reindex on an axis with duplicate labels" + ci = CategoricalIndex(["a", "b", "c", "a"]) + with pytest.raises(ValueError, match=msg): + ci.reindex(["a", "c"]) + + def test_reindex_categorical_non_unique(self): + msg = "cannot reindex on an axis with duplicate labels" + ci = CategoricalIndex(["a", "b", "c", "a"]) + with pytest.raises(ValueError, match=msg): + ci.reindex(Categorical(["a", "c"])) + + def test_reindex_list_non_unique_unused_category(self): + msg = "cannot reindex on an axis with duplicate labels" + ci = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) + with pytest.raises(ValueError, match=msg): + ci.reindex(["a", "c"]) + + def test_reindex_categorical_non_unique_unused_category(self): + msg = "cannot reindex on an axis with duplicate labels" + ci = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) + with pytest.raises(ValueError, match=msg): + ci.reindex(Categorical(["a", "c"])) + + def test_reindex_duplicate_target(self): + # See GH25459 + cat = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"]) + res, indexer = cat.reindex(["a", "c", "c"]) + exp = Index(["a", "c", "c"]) + tm.assert_index_equal(res, exp, exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp)) + + res, indexer = cat.reindex( + CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"]) + ) + exp = CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"]) + tm.assert_index_equal(res, exp, exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp)) + + def test_reindex_empty_index(self): + # See GH16770 + c = CategoricalIndex([]) + res, indexer = c.reindex(["a", "b"]) + tm.assert_index_equal(res, Index(["a", "b"]), exact=True) + tm.assert_numpy_array_equal(indexer, np.array([-1, -1], dtype=np.intp)) + + def test_reindex_categorical_added_category(self): + # GH 42424 + ci = CategoricalIndex( + [Interval(0, 1, closed="right"), Interval(1, 2, closed="right")], + ordered=True, + ) + ci_add = CategoricalIndex( + [ + Interval(0, 1, closed="right"), + Interval(1, 2, closed="right"), + Interval(2, 3, closed="right"), + Interval(3, 4, closed="right"), + ], + ordered=True, + ) + result, _ = ci.reindex(ci_add) + expected = ci_add + tm.assert_index_equal(expected, result) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_setops.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_setops.py new file mode 100644 index 0000000000000000000000000000000000000000..2e87b90efd54c8fcc4dcab7ec538d461add370de --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/categorical/test_setops.py @@ -0,0 +1,18 @@ +import numpy as np +import pytest + +from pandas import ( + CategoricalIndex, + Index, +) +import pandas._testing as tm + + +@pytest.mark.parametrize("na_value", [None, np.nan]) +def test_difference_with_na(na_value): + # GH 57318 + ci = CategoricalIndex(["a", "b", "c", None]) + other = Index(["c", na_value]) + result = ci.difference(other) + expected = CategoricalIndex(["a", "b"], categories=["a", "b", "c"]) + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ed875f74bdcdd1fa64d430465c72c09c2744592 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/test_drop_duplicates.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/test_drop_duplicates.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c8a2e9121ea91011b5cf2cf992607f10c5d55478 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/test_drop_duplicates.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/test_equals.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/test_equals.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41687e1d2b825dcad8c1c2b4c98389af39a0c178 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/test_equals.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/test_indexing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/test_indexing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36b45899698218b407052bd0fc19ea6b0a85fed4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/test_indexing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/test_is_monotonic.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/test_is_monotonic.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb933945e2470cad03b2cb3c69aa28983a2872b4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/test_is_monotonic.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/test_nat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/test_nat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c85944d5a7a95a00ea79913177c2e4c2302c70b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/test_nat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/test_sort_values.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/test_sort_values.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a65e9d17054b6bf94e515a91a7d0d9762a5ea853 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/test_sort_values.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/test_value_counts.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/test_value_counts.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bbb7baebe1eb6b51ce48989efd68a2c409447296 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/__pycache__/test_value_counts.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py new file mode 100644 index 0000000000000000000000000000000000000000..61a79c4ceabf9d68aab73ffd69e0f15ad842ff74 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py @@ -0,0 +1,89 @@ +import numpy as np +import pytest + +from pandas import ( + PeriodIndex, + Series, + date_range, + period_range, + timedelta_range, +) +import pandas._testing as tm + + +class DropDuplicates: + def test_drop_duplicates_metadata(self, idx): + # GH#10115 + result = idx.drop_duplicates() + tm.assert_index_equal(idx, result) + assert idx.freq == result.freq + + idx_dup = idx.append(idx) + result = idx_dup.drop_duplicates() + + expected = idx + if not isinstance(idx, PeriodIndex): + # freq is reset except for PeriodIndex + assert idx_dup.freq is None + assert result.freq is None + expected = idx._with_freq(None) + else: + assert result.freq == expected.freq + + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "keep, expected, index", + [ + ( + "first", + np.concatenate(([False] * 10, [True] * 5)), + np.arange(0, 10, dtype=np.int64), + ), + ( + "last", + np.concatenate(([True] * 5, [False] * 10)), + np.arange(5, 15, dtype=np.int64), + ), + ( + False, + np.concatenate(([True] * 5, [False] * 5, [True] * 5)), + np.arange(5, 10, dtype=np.int64), + ), + ], + ) + def test_drop_duplicates(self, keep, expected, index, idx): + # to check Index/Series compat + idx = idx.append(idx[:5]) + + tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected) + expected = idx[~expected] + + result = idx.drop_duplicates(keep=keep) + tm.assert_index_equal(result, expected) + + result = Series(idx).drop_duplicates(keep=keep) + expected = Series(expected, index=index) + tm.assert_series_equal(result, expected) + + +class TestDropDuplicatesPeriodIndex(DropDuplicates): + @pytest.fixture(params=["D", "3D", "h", "2h", "min", "2min", "s", "3s"]) + def freq(self, request): + return request.param + + @pytest.fixture + def idx(self, freq): + return period_range("2011-01-01", periods=10, freq=freq, name="idx") + + +class TestDropDuplicatesDatetimeIndex(DropDuplicates): + @pytest.fixture + def idx(self, freq_sample): + return date_range("2011-01-01", freq=freq_sample, periods=10, name="idx") + + +class TestDropDuplicatesTimedeltaIndex(DropDuplicates): + @pytest.fixture + def idx(self, freq_sample): + return timedelta_range("1 day", periods=10, freq=freq_sample, name="idx") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/test_equals.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/test_equals.py new file mode 100644 index 0000000000000000000000000000000000000000..fc9fbd33d0d285fe7635c23c598318208bb58561 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/test_equals.py @@ -0,0 +1,181 @@ +""" +Tests shared for DatetimeIndex/TimedeltaIndex/PeriodIndex +""" +from datetime import ( + datetime, + timedelta, +) + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + CategoricalIndex, + DatetimeIndex, + Index, + PeriodIndex, + TimedeltaIndex, + date_range, + period_range, + timedelta_range, +) +import pandas._testing as tm + + +class EqualsTests: + def test_not_equals_numeric(self, index): + assert not index.equals(Index(index.asi8)) + assert not index.equals(Index(index.asi8.astype("u8"))) + assert not index.equals(Index(index.asi8).astype("f8")) + + def test_equals(self, index): + assert index.equals(index) + assert index.equals(index.astype(object)) + assert index.equals(CategoricalIndex(index)) + assert index.equals(CategoricalIndex(index.astype(object))) + + def test_not_equals_non_arraylike(self, index): + assert not index.equals(list(index)) + + def test_not_equals_strings(self, index): + other = Index([str(x) for x in index], dtype=object) + assert not index.equals(other) + assert not index.equals(CategoricalIndex(other)) + + def test_not_equals_misc_strs(self, index): + other = Index(list("abc")) + assert not index.equals(other) + + +class TestPeriodIndexEquals(EqualsTests): + @pytest.fixture + def index(self): + return period_range("2013-01-01", periods=5, freq="D") + + # TODO: de-duplicate with other test_equals2 methods + @pytest.mark.parametrize("freq", ["D", "M"]) + def test_equals2(self, freq): + # GH#13107 + idx = PeriodIndex(["2011-01-01", "2011-01-02", "NaT"], freq=freq) + assert idx.equals(idx) + assert idx.equals(idx.copy()) + assert idx.equals(idx.astype(object)) + assert idx.astype(object).equals(idx) + assert idx.astype(object).equals(idx.astype(object)) + assert not idx.equals(list(idx)) + assert not idx.equals(pd.Series(idx)) + + idx2 = PeriodIndex(["2011-01-01", "2011-01-02", "NaT"], freq="h") + assert not idx.equals(idx2) + assert not idx.equals(idx2.copy()) + assert not idx.equals(idx2.astype(object)) + assert not idx.astype(object).equals(idx2) + assert not idx.equals(list(idx2)) + assert not idx.equals(pd.Series(idx2)) + + # same internal, different tz + idx3 = PeriodIndex._simple_new( + idx._values._simple_new(idx._values.asi8, dtype=pd.PeriodDtype("h")) + ) + tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) + assert not idx.equals(idx3) + assert not idx.equals(idx3.copy()) + assert not idx.equals(idx3.astype(object)) + assert not idx.astype(object).equals(idx3) + assert not idx.equals(list(idx3)) + assert not idx.equals(pd.Series(idx3)) + + +class TestDatetimeIndexEquals(EqualsTests): + @pytest.fixture + def index(self): + return date_range("2013-01-01", periods=5) + + def test_equals2(self): + # GH#13107 + idx = DatetimeIndex(["2011-01-01", "2011-01-02", "NaT"]) + assert idx.equals(idx) + assert idx.equals(idx.copy()) + assert idx.equals(idx.astype(object)) + assert idx.astype(object).equals(idx) + assert idx.astype(object).equals(idx.astype(object)) + assert not idx.equals(list(idx)) + assert not idx.equals(pd.Series(idx)) + + idx2 = DatetimeIndex(["2011-01-01", "2011-01-02", "NaT"], tz="US/Pacific") + assert not idx.equals(idx2) + assert not idx.equals(idx2.copy()) + assert not idx.equals(idx2.astype(object)) + assert not idx.astype(object).equals(idx2) + assert not idx.equals(list(idx2)) + assert not idx.equals(pd.Series(idx2)) + + # same internal, different tz + idx3 = DatetimeIndex(idx.asi8, tz="US/Pacific") + tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) + assert not idx.equals(idx3) + assert not idx.equals(idx3.copy()) + assert not idx.equals(idx3.astype(object)) + assert not idx.astype(object).equals(idx3) + assert not idx.equals(list(idx3)) + assert not idx.equals(pd.Series(idx3)) + + # check that we do not raise when comparing with OutOfBounds objects + oob = Index([datetime(2500, 1, 1)] * 3, dtype=object) + assert not idx.equals(oob) + assert not idx2.equals(oob) + assert not idx3.equals(oob) + + # check that we do not raise when comparing with OutOfBounds dt64 + oob2 = oob.map(np.datetime64) + assert not idx.equals(oob2) + assert not idx2.equals(oob2) + assert not idx3.equals(oob2) + + @pytest.mark.parametrize("freq", ["B", "C"]) + def test_not_equals_bday(self, freq): + rng = date_range("2009-01-01", "2010-01-01", freq=freq) + assert not rng.equals(list(rng)) + + +class TestTimedeltaIndexEquals(EqualsTests): + @pytest.fixture + def index(self): + return timedelta_range("1 day", periods=10) + + def test_equals2(self): + # GH#13107 + idx = TimedeltaIndex(["1 days", "2 days", "NaT"]) + assert idx.equals(idx) + assert idx.equals(idx.copy()) + assert idx.equals(idx.astype(object)) + assert idx.astype(object).equals(idx) + assert idx.astype(object).equals(idx.astype(object)) + assert not idx.equals(list(idx)) + assert not idx.equals(pd.Series(idx)) + + idx2 = TimedeltaIndex(["2 days", "1 days", "NaT"]) + assert not idx.equals(idx2) + assert not idx.equals(idx2.copy()) + assert not idx.equals(idx2.astype(object)) + assert not idx.astype(object).equals(idx2) + assert not idx.astype(object).equals(idx2.astype(object)) + assert not idx.equals(list(idx2)) + assert not idx.equals(pd.Series(idx2)) + + # Check that we dont raise OverflowError on comparisons outside the + # implementation range GH#28532 + oob = Index([timedelta(days=10**6)] * 3, dtype=object) + assert not idx.equals(oob) + assert not idx2.equals(oob) + + oob2 = Index([np.timedelta64(x) for x in oob], dtype=object) + assert (oob == oob2).all() + assert not idx.equals(oob2) + assert not idx2.equals(oob2) + + oob3 = oob.map(np.timedelta64) + assert (oob3 == oob).all() + assert not idx.equals(oob3) + assert not idx2.equals(oob3) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/test_indexing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/test_indexing.py new file mode 100644 index 0000000000000000000000000000000000000000..7b2c81aaf17de3785e62a2d989394259b2496085 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/test_indexing.py @@ -0,0 +1,45 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DatetimeIndex, + Index, +) +import pandas._testing as tm + +dtlike_dtypes = [ + np.dtype("timedelta64[ns]"), + np.dtype("datetime64[ns]"), + pd.DatetimeTZDtype("ns", "Asia/Tokyo"), + pd.PeriodDtype("ns"), +] + + +@pytest.mark.parametrize("ldtype", dtlike_dtypes) +@pytest.mark.parametrize("rdtype", dtlike_dtypes) +def test_get_indexer_non_unique_wrong_dtype(ldtype, rdtype): + vals = np.tile(3600 * 10**9 * np.arange(3, dtype=np.int64), 2) + + def construct(dtype): + if dtype is dtlike_dtypes[-1]: + # PeriodArray will try to cast ints to strings + return DatetimeIndex(vals).astype(dtype) + return Index(vals, dtype=dtype) + + left = construct(ldtype) + right = construct(rdtype) + + result = left.get_indexer_non_unique(right) + + if ldtype is rdtype: + ex1 = np.array([0, 3, 1, 4, 2, 5] * 2, dtype=np.intp) + ex2 = np.array([], dtype=np.intp) + tm.assert_numpy_array_equal(result[0], ex1) + tm.assert_numpy_array_equal(result[1], ex2) + + else: + no_matches = np.array([-1] * 6, dtype=np.intp) + missing = np.arange(6, dtype=np.intp) + tm.assert_numpy_array_equal(result[0], no_matches) + tm.assert_numpy_array_equal(result[1], missing) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/test_is_monotonic.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/test_is_monotonic.py new file mode 100644 index 0000000000000000000000000000000000000000..b0e42e660b751cddaca74c4574e9588e8ac8c782 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/test_is_monotonic.py @@ -0,0 +1,46 @@ +from pandas import ( + Index, + NaT, + date_range, +) + + +def test_is_monotonic_with_nat(): + # GH#31437 + # PeriodIndex.is_monotonic_increasing should behave analogously to DatetimeIndex, + # in particular never be monotonic when we have NaT + dti = date_range("2016-01-01", periods=3) + pi = dti.to_period("D") + tdi = Index(dti.view("timedelta64[ns]")) + + for obj in [pi, pi._engine, dti, dti._engine, tdi, tdi._engine]: + if isinstance(obj, Index): + # i.e. not Engines + assert obj.is_monotonic_increasing + assert obj.is_monotonic_increasing + assert not obj.is_monotonic_decreasing + assert obj.is_unique + + dti1 = dti.insert(0, NaT) + pi1 = dti1.to_period("D") + tdi1 = Index(dti1.view("timedelta64[ns]")) + + for obj in [pi1, pi1._engine, dti1, dti1._engine, tdi1, tdi1._engine]: + if isinstance(obj, Index): + # i.e. not Engines + assert not obj.is_monotonic_increasing + assert not obj.is_monotonic_increasing + assert not obj.is_monotonic_decreasing + assert obj.is_unique + + dti2 = dti.insert(3, NaT) + pi2 = dti2.to_period("h") + tdi2 = Index(dti2.view("timedelta64[ns]")) + + for obj in [pi2, pi2._engine, dti2, dti2._engine, tdi2, tdi2._engine]: + if isinstance(obj, Index): + # i.e. not Engines + assert not obj.is_monotonic_increasing + assert not obj.is_monotonic_increasing + assert not obj.is_monotonic_decreasing + assert obj.is_unique diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/test_nat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/test_nat.py new file mode 100644 index 0000000000000000000000000000000000000000..50cf29d0163555876eb7b1914bbd4ee45bc2285e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/test_nat.py @@ -0,0 +1,53 @@ +import numpy as np +import pytest + +from pandas import ( + DatetimeIndex, + NaT, + PeriodIndex, + TimedeltaIndex, +) +import pandas._testing as tm + + +class NATests: + def test_nat(self, index_without_na): + empty_index = index_without_na[:0] + + index_with_na = index_without_na.copy(deep=True) + index_with_na._data[1] = NaT + + assert empty_index._na_value is NaT + assert index_with_na._na_value is NaT + assert index_without_na._na_value is NaT + + idx = index_without_na + assert idx._can_hold_na + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) + assert idx.hasnans is False + + idx = index_with_na + assert idx._can_hold_na + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) + assert idx.hasnans is True + + +class TestDatetimeIndexNA(NATests): + @pytest.fixture + def index_without_na(self, tz_naive_fixture): + tz = tz_naive_fixture + return DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz) + + +class TestTimedeltaIndexNA(NATests): + @pytest.fixture + def index_without_na(self): + return TimedeltaIndex(["1 days", "2 days"]) + + +class TestPeriodIndexNA(NATests): + @pytest.fixture + def index_without_na(self): + return PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/test_sort_values.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/test_sort_values.py new file mode 100644 index 0000000000000000000000000000000000000000..a2c349c8b0ef679b9d32411efd8a0d393b9d5e9d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/test_sort_values.py @@ -0,0 +1,315 @@ +import numpy as np +import pytest + +from pandas import ( + DatetimeIndex, + Index, + NaT, + PeriodIndex, + TimedeltaIndex, + timedelta_range, +) +import pandas._testing as tm + + +def check_freq_ascending(ordered, orig, ascending): + """ + Check the expected freq on a PeriodIndex/DatetimeIndex/TimedeltaIndex + when the original index is generated (or generate-able) with + period_range/date_range/timedelta_range. + """ + if isinstance(ordered, PeriodIndex): + assert ordered.freq == orig.freq + elif isinstance(ordered, (DatetimeIndex, TimedeltaIndex)): + if ascending: + assert ordered.freq.n == orig.freq.n + else: + assert ordered.freq.n == -1 * orig.freq.n + + +def check_freq_nonmonotonic(ordered, orig): + """ + Check the expected freq on a PeriodIndex/DatetimeIndex/TimedeltaIndex + when the original index is _not_ generated (or generate-able) with + period_range/date_range//timedelta_range. + """ + if isinstance(ordered, PeriodIndex): + assert ordered.freq == orig.freq + elif isinstance(ordered, (DatetimeIndex, TimedeltaIndex)): + assert ordered.freq is None + + +class TestSortValues: + @pytest.fixture(params=[DatetimeIndex, TimedeltaIndex, PeriodIndex]) + def non_monotonic_idx(self, request): + if request.param is DatetimeIndex: + return DatetimeIndex(["2000-01-04", "2000-01-01", "2000-01-02"]) + elif request.param is PeriodIndex: + dti = DatetimeIndex(["2000-01-04", "2000-01-01", "2000-01-02"]) + return dti.to_period("D") + else: + return TimedeltaIndex( + ["1 day 00:00:05", "1 day 00:00:01", "1 day 00:00:02"] + ) + + def test_argmin_argmax(self, non_monotonic_idx): + assert non_monotonic_idx.argmin() == 1 + assert non_monotonic_idx.argmax() == 0 + + def test_sort_values(self, non_monotonic_idx): + idx = non_monotonic_idx + ordered = idx.sort_values() + assert ordered.is_monotonic_increasing + ordered = idx.sort_values(ascending=False) + assert ordered[::-1].is_monotonic_increasing + + ordered, dexer = idx.sort_values(return_indexer=True) + assert ordered.is_monotonic_increasing + tm.assert_numpy_array_equal(dexer, np.array([1, 2, 0], dtype=np.intp)) + + ordered, dexer = idx.sort_values(return_indexer=True, ascending=False) + assert ordered[::-1].is_monotonic_increasing + tm.assert_numpy_array_equal(dexer, np.array([0, 2, 1], dtype=np.intp)) + + def check_sort_values_with_freq(self, idx): + ordered = idx.sort_values() + tm.assert_index_equal(ordered, idx) + check_freq_ascending(ordered, idx, True) + + ordered = idx.sort_values(ascending=False) + expected = idx[::-1] + tm.assert_index_equal(ordered, expected) + check_freq_ascending(ordered, idx, False) + + ordered, indexer = idx.sort_values(return_indexer=True) + tm.assert_index_equal(ordered, idx) + tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2], dtype=np.intp)) + check_freq_ascending(ordered, idx, True) + + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) + expected = idx[::-1] + tm.assert_index_equal(ordered, expected) + tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0], dtype=np.intp)) + check_freq_ascending(ordered, idx, False) + + @pytest.mark.parametrize("freq", ["D", "h"]) + def test_sort_values_with_freq_timedeltaindex(self, freq): + # GH#10295 + idx = timedelta_range(start=f"1{freq}", periods=3, freq=freq).rename("idx") + + self.check_sort_values_with_freq(idx) + + @pytest.mark.parametrize( + "idx", + [ + DatetimeIndex( + ["2011-01-01", "2011-01-02", "2011-01-03"], freq="D", name="idx" + ), + DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], + freq="h", + name="tzidx", + tz="Asia/Tokyo", + ), + ], + ) + def test_sort_values_with_freq_datetimeindex(self, idx): + self.check_sort_values_with_freq(idx) + + @pytest.mark.parametrize("freq", ["D", "2D", "4D"]) + def test_sort_values_with_freq_periodindex(self, freq): + # here with_freq refers to being period_range-like + idx = PeriodIndex( + ["2011-01-01", "2011-01-02", "2011-01-03"], freq=freq, name="idx" + ) + self.check_sort_values_with_freq(idx) + + @pytest.mark.parametrize( + "idx", + [ + PeriodIndex(["2011", "2012", "2013"], name="pidx", freq="Y"), + Index([2011, 2012, 2013], name="idx"), # for compatibility check + ], + ) + def test_sort_values_with_freq_periodindex2(self, idx): + # here with_freq indicates this is period_range-like + self.check_sort_values_with_freq(idx) + + def check_sort_values_without_freq(self, idx, expected): + ordered = idx.sort_values(na_position="first") + tm.assert_index_equal(ordered, expected) + check_freq_nonmonotonic(ordered, idx) + + if not idx.isna().any(): + ordered = idx.sort_values() + tm.assert_index_equal(ordered, expected) + check_freq_nonmonotonic(ordered, idx) + + ordered = idx.sort_values(ascending=False) + tm.assert_index_equal(ordered, expected[::-1]) + check_freq_nonmonotonic(ordered, idx) + + ordered, indexer = idx.sort_values(return_indexer=True, na_position="first") + tm.assert_index_equal(ordered, expected) + + exp = np.array([0, 4, 3, 1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, exp) + check_freq_nonmonotonic(ordered, idx) + + if not idx.isna().any(): + ordered, indexer = idx.sort_values(return_indexer=True) + tm.assert_index_equal(ordered, expected) + + exp = np.array([0, 4, 3, 1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, exp) + check_freq_nonmonotonic(ordered, idx) + + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) + tm.assert_index_equal(ordered, expected[::-1]) + + exp = np.array([2, 1, 3, 0, 4], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, exp) + check_freq_nonmonotonic(ordered, idx) + + def test_sort_values_without_freq_timedeltaindex(self): + # GH#10295 + + idx = TimedeltaIndex( + ["1 hour", "3 hour", "5 hour", "2 hour ", "1 hour"], name="idx1" + ) + expected = TimedeltaIndex( + ["1 hour", "1 hour", "2 hour", "3 hour", "5 hour"], name="idx1" + ) + self.check_sort_values_without_freq(idx, expected) + + @pytest.mark.parametrize( + "index_dates,expected_dates", + [ + ( + ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-02", "2011-01-01"], + ["2011-01-01", "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-05"], + ), + ( + ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-02", "2011-01-01"], + ["2011-01-01", "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-05"], + ), + ( + [NaT, "2011-01-03", "2011-01-05", "2011-01-02", NaT], + [NaT, NaT, "2011-01-02", "2011-01-03", "2011-01-05"], + ), + ], + ) + def test_sort_values_without_freq_datetimeindex( + self, index_dates, expected_dates, tz_naive_fixture + ): + tz = tz_naive_fixture + + # without freq + idx = DatetimeIndex(index_dates, tz=tz, name="idx") + expected = DatetimeIndex(expected_dates, tz=tz, name="idx") + + self.check_sort_values_without_freq(idx, expected) + + @pytest.mark.parametrize( + "idx,expected", + [ + ( + PeriodIndex( + [ + "2011-01-01", + "2011-01-03", + "2011-01-05", + "2011-01-02", + "2011-01-01", + ], + freq="D", + name="idx1", + ), + PeriodIndex( + [ + "2011-01-01", + "2011-01-01", + "2011-01-02", + "2011-01-03", + "2011-01-05", + ], + freq="D", + name="idx1", + ), + ), + ( + PeriodIndex( + [ + "2011-01-01", + "2011-01-03", + "2011-01-05", + "2011-01-02", + "2011-01-01", + ], + freq="D", + name="idx2", + ), + PeriodIndex( + [ + "2011-01-01", + "2011-01-01", + "2011-01-02", + "2011-01-03", + "2011-01-05", + ], + freq="D", + name="idx2", + ), + ), + ( + PeriodIndex( + [NaT, "2011-01-03", "2011-01-05", "2011-01-02", NaT], + freq="D", + name="idx3", + ), + PeriodIndex( + [NaT, NaT, "2011-01-02", "2011-01-03", "2011-01-05"], + freq="D", + name="idx3", + ), + ), + ( + PeriodIndex( + ["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="Y" + ), + PeriodIndex( + ["2011", "2011", "2012", "2013", "2015"], name="pidx", freq="Y" + ), + ), + ( + # For compatibility check + Index([2011, 2013, 2015, 2012, 2011], name="idx"), + Index([2011, 2011, 2012, 2013, 2015], name="idx"), + ), + ], + ) + def test_sort_values_without_freq_periodindex(self, idx, expected): + # here without_freq means not generateable by period_range + self.check_sort_values_without_freq(idx, expected) + + def test_sort_values_without_freq_periodindex_nat(self): + # doesn't quite fit into check_sort_values_without_freq + idx = PeriodIndex(["2011", "2013", "NaT", "2011"], name="pidx", freq="D") + expected = PeriodIndex(["NaT", "2011", "2011", "2013"], name="pidx", freq="D") + + ordered = idx.sort_values(na_position="first") + tm.assert_index_equal(ordered, expected) + check_freq_nonmonotonic(ordered, idx) + + ordered = idx.sort_values(ascending=False) + tm.assert_index_equal(ordered, expected[::-1]) + check_freq_nonmonotonic(ordered, idx) + + +def test_order_stability_compat(): + # GH#35922. sort_values is stable both for normal and datetime-like Index + pidx = PeriodIndex(["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="Y") + iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx") + ordered1, indexer1 = pidx.sort_values(return_indexer=True, ascending=False) + ordered2, indexer2 = iidx.sort_values(return_indexer=True, ascending=False) + tm.assert_numpy_array_equal(indexer1, indexer2) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/test_value_counts.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/test_value_counts.py new file mode 100644 index 0000000000000000000000000000000000000000..069e354a364c9343c595da9d18ee7c04eec04f43 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimelike_/test_value_counts.py @@ -0,0 +1,103 @@ +import numpy as np + +from pandas import ( + DatetimeIndex, + NaT, + PeriodIndex, + Series, + TimedeltaIndex, + date_range, + period_range, + timedelta_range, +) +import pandas._testing as tm + + +class TestValueCounts: + # GH#7735 + + def test_value_counts_unique_datetimeindex(self, tz_naive_fixture): + tz = tz_naive_fixture + orig = date_range("2011-01-01 09:00", freq="h", periods=10, tz=tz) + self._check_value_counts_with_repeats(orig) + + def test_value_counts_unique_timedeltaindex(self): + orig = timedelta_range("1 days 09:00:00", freq="h", periods=10) + self._check_value_counts_with_repeats(orig) + + def test_value_counts_unique_periodindex(self): + orig = period_range("2011-01-01 09:00", freq="h", periods=10) + self._check_value_counts_with_repeats(orig) + + def _check_value_counts_with_repeats(self, orig): + # create repeated values, 'n'th element is repeated by n+1 times + idx = type(orig)( + np.repeat(orig._values, range(1, len(orig) + 1)), dtype=orig.dtype + ) + + exp_idx = orig[::-1] + if not isinstance(exp_idx, PeriodIndex): + exp_idx = exp_idx._with_freq(None) + expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64", name="count") + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + tm.assert_index_equal(idx.unique(), orig) + + def test_value_counts_unique_datetimeindex2(self, tz_naive_fixture): + tz = tz_naive_fixture + idx = DatetimeIndex( + [ + "2013-01-01 09:00", + "2013-01-01 09:00", + "2013-01-01 09:00", + "2013-01-01 08:00", + "2013-01-01 08:00", + NaT, + ], + tz=tz, + ) + self._check_value_counts_dropna(idx) + + def test_value_counts_unique_timedeltaindex2(self): + idx = TimedeltaIndex( + [ + "1 days 09:00:00", + "1 days 09:00:00", + "1 days 09:00:00", + "1 days 08:00:00", + "1 days 08:00:00", + NaT, + ] + ) + self._check_value_counts_dropna(idx) + + def test_value_counts_unique_periodindex2(self): + idx = PeriodIndex( + [ + "2013-01-01 09:00", + "2013-01-01 09:00", + "2013-01-01 09:00", + "2013-01-01 08:00", + "2013-01-01 08:00", + NaT, + ], + freq="h", + ) + self._check_value_counts_dropna(idx) + + def _check_value_counts_dropna(self, idx): + exp_idx = idx[[2, 3]] + expected = Series([3, 2], index=exp_idx, name="count") + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + exp_idx = idx[[2, 3, -1]] + expected = Series([3, 2, 1], index=exp_idx, name="count") + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(dropna=False), expected) + + tm.assert_index_equal(idx.unique(), exp_idx) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd02197bc4a834bcbec205e18ff39fa75f156362 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_delete.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_delete.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf7992bd976b6f5538834014842d734f5bd6b3d8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_delete.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_factorize.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_factorize.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3a6b65f8d0448f45ac28fbd18b875782c3ae53a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_factorize.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_insert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_insert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b99b7aeab7394f022e4f9bfa568908dae4b2e96f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_insert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_isocalendar.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_isocalendar.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dfcede376ae747e980947b22f24f2bc17e35252e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_isocalendar.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_map.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_map.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..291004bcab21a8ed3426b22fb0870f289c50a47a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_map.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_normalize.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_normalize.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a1515cb13d0dad0e103405c09acf268d37a567cd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_normalize.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_repeat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_repeat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ab75a301624bc1f55646b2da03b8b0658c710a6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_repeat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_shift.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_shift.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b4e96ea736e873832093f2d830a2edd63442d5b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_shift.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_to_frame.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_to_frame.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58de0a1db00bc2b780bbd41cceefd763d61a6eb2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_to_frame.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_to_period.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_to_period.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81c4cdb752d77ca573e508472753664f2cec6217 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_to_period.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_to_pydatetime.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_to_pydatetime.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..43c1c7a3a74b46630d3311fabf5b7085fa4c9794 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_to_pydatetime.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_to_series.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_to_series.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c47c8cee3092729646ddd1dcd7d3a1da030fe67a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_to_series.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_tz_localize.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_tz_localize.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f785ff6905bc2a6ff248121469fae4b0f57442b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_tz_localize.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_unique.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_unique.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b5826574ad59c17ad1f1b4ca2d7ba066de8bccb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/methods/__pycache__/test_unique.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_arithmetic.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_arithmetic.py new file mode 100644 index 0000000000000000000000000000000000000000..3a7c418b27de6ddf79c87a813d43f21369ecc367 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_arithmetic.py @@ -0,0 +1,56 @@ +# Arithmetic tests specific to DatetimeIndex are generally about `freq` +# rentention or inference. Other arithmetic tests belong in +# tests/arithmetic/test_datetime64.py +import pytest + +from pandas import ( + Timedelta, + TimedeltaIndex, + Timestamp, + date_range, + timedelta_range, +) +import pandas._testing as tm + + +class TestDatetimeIndexArithmetic: + def test_add_timedelta_preserves_freq(self): + # GH#37295 should hold for any DTI with freq=None or Tick freq + tz = "Canada/Eastern" + dti = date_range( + start=Timestamp("2019-03-26 00:00:00-0400", tz=tz), + end=Timestamp("2020-10-17 00:00:00-0400", tz=tz), + freq="D", + ) + result = dti + Timedelta(days=1) + assert result.freq == dti.freq + + def test_sub_datetime_preserves_freq(self, tz_naive_fixture): + # GH#48818 + dti = date_range("2016-01-01", periods=12, tz=tz_naive_fixture) + + res = dti - dti[0] + expected = timedelta_range("0 Days", "11 Days") + tm.assert_index_equal(res, expected) + assert res.freq == expected.freq + + @pytest.mark.xfail( + reason="The inherited freq is incorrect bc dti.freq is incorrect " + "https://github.com/pandas-dev/pandas/pull/48818/files#r982793461" + ) + def test_sub_datetime_preserves_freq_across_dst(self): + # GH#48818 + ts = Timestamp("2016-03-11", tz="US/Pacific") + dti = date_range(ts, periods=4) + + res = dti - dti[0] + expected = TimedeltaIndex( + [ + Timedelta(days=0), + Timedelta(days=1), + Timedelta(days=2), + Timedelta(days=2, hours=23), + ] + ) + tm.assert_index_equal(res, expected) + assert res.freq == expected.freq diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_constructors.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_constructors.py new file mode 100644 index 0000000000000000000000000000000000000000..2abbcf6688833ff05600d8e360711c8ff973a343 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_constructors.py @@ -0,0 +1,1204 @@ +from __future__ import annotations + +from datetime import ( + datetime, + timedelta, + timezone, +) +from functools import partial +from operator import attrgetter + +import dateutil +import dateutil.tz +from dateutil.tz import gettz +import numpy as np +import pytest +import pytz + +from pandas._libs.tslibs import ( + OutOfBoundsDatetime, + astype_overflowsafe, + timezones, +) + +import pandas as pd +from pandas import ( + DatetimeIndex, + Index, + Timestamp, + date_range, + offsets, + to_datetime, +) +import pandas._testing as tm +from pandas.core.arrays import period_array + + +class TestDatetimeIndex: + def test_closed_deprecated(self): + # GH#52628 + msg = "The 'closed' keyword" + with tm.assert_produces_warning(FutureWarning, match=msg): + DatetimeIndex([], closed=True) + + def test_normalize_deprecated(self): + # GH#52628 + msg = "The 'normalize' keyword" + with tm.assert_produces_warning(FutureWarning, match=msg): + DatetimeIndex([], normalize=True) + + def test_from_dt64_unsupported_unit(self): + # GH#49292 + val = np.datetime64(1, "D") + result = DatetimeIndex([val], tz="US/Pacific") + + expected = DatetimeIndex([val.astype("M8[s]")], tz="US/Pacific") + tm.assert_index_equal(result, expected) + + def test_explicit_tz_none(self): + # GH#48659 + dti = date_range("2016-01-01", periods=10, tz="UTC") + + msg = "Passed data is timezone-aware, incompatible with 'tz=None'" + with pytest.raises(ValueError, match=msg): + DatetimeIndex(dti, tz=None) + + with pytest.raises(ValueError, match=msg): + DatetimeIndex(np.array(dti), tz=None) + + msg = "Cannot pass both a timezone-aware dtype and tz=None" + with pytest.raises(ValueError, match=msg): + DatetimeIndex([], dtype="M8[ns, UTC]", tz=None) + + def test_freq_validation_with_nat(self): + # GH#11587 make sure we get a useful error message when generate_range + # raises + msg = ( + "Inferred frequency None from passed values does not conform " + "to passed frequency D" + ) + with pytest.raises(ValueError, match=msg): + DatetimeIndex([pd.NaT, Timestamp("2011-01-01")], freq="D") + with pytest.raises(ValueError, match=msg): + DatetimeIndex([pd.NaT, Timestamp("2011-01-01")._value], freq="D") + + # TODO: better place for tests shared by DTI/TDI? + @pytest.mark.parametrize( + "index", + [ + date_range("2016-01-01", periods=5, tz="US/Pacific"), + pd.timedelta_range("1 Day", periods=5), + ], + ) + def test_shallow_copy_inherits_array_freq(self, index): + # If we pass a DTA/TDA to shallow_copy and dont specify a freq, + # we should inherit the array's freq, not our own. + array = index._data + + arr = array[[0, 3, 2, 4, 1]] + assert arr.freq is None + + result = index._shallow_copy(arr) + assert result.freq is None + + def test_categorical_preserves_tz(self): + # GH#18664 retain tz when going DTI-->Categorical-->DTI + dti = DatetimeIndex( + [pd.NaT, "2015-01-01", "1999-04-06 15:14:13", "2015-01-01"], tz="US/Eastern" + ) + + for dtobj in [dti, dti._data]: + # works for DatetimeIndex or DatetimeArray + + ci = pd.CategoricalIndex(dtobj) + carr = pd.Categorical(dtobj) + cser = pd.Series(ci) + + for obj in [ci, carr, cser]: + result = DatetimeIndex(obj) + tm.assert_index_equal(result, dti) + + def test_dti_with_period_data_raises(self): + # GH#23675 + data = pd.PeriodIndex(["2016Q1", "2016Q2"], freq="Q") + + with pytest.raises(TypeError, match="PeriodDtype data is invalid"): + DatetimeIndex(data) + + with pytest.raises(TypeError, match="PeriodDtype data is invalid"): + to_datetime(data) + + with pytest.raises(TypeError, match="PeriodDtype data is invalid"): + DatetimeIndex(period_array(data)) + + with pytest.raises(TypeError, match="PeriodDtype data is invalid"): + to_datetime(period_array(data)) + + def test_dti_with_timedelta64_data_raises(self): + # GH#23675 deprecated, enforrced in GH#29794 + data = np.array([0], dtype="m8[ns]") + msg = r"timedelta64\[ns\] cannot be converted to datetime64" + with pytest.raises(TypeError, match=msg): + DatetimeIndex(data) + + with pytest.raises(TypeError, match=msg): + to_datetime(data) + + with pytest.raises(TypeError, match=msg): + DatetimeIndex(pd.TimedeltaIndex(data)) + + with pytest.raises(TypeError, match=msg): + to_datetime(pd.TimedeltaIndex(data)) + + def test_constructor_from_sparse_array(self): + # https://github.com/pandas-dev/pandas/issues/35843 + values = [ + Timestamp("2012-05-01T01:00:00.000000"), + Timestamp("2016-05-01T01:00:00.000000"), + ] + arr = pd.arrays.SparseArray(values) + result = Index(arr) + assert type(result) is Index + assert result.dtype == arr.dtype + + def test_construction_caching(self): + df = pd.DataFrame( + { + "dt": date_range("20130101", periods=3), + "dttz": date_range("20130101", periods=3, tz="US/Eastern"), + "dt_with_null": [ + Timestamp("20130101"), + pd.NaT, + Timestamp("20130103"), + ], + "dtns": date_range("20130101", periods=3, freq="ns"), + } + ) + assert df.dttz.dtype.tz.zone == "US/Eastern" + + @pytest.mark.parametrize( + "kwargs", + [{"tz": "dtype.tz"}, {"dtype": "dtype"}, {"dtype": "dtype", "tz": "dtype.tz"}], + ) + def test_construction_with_alt(self, kwargs, tz_aware_fixture): + tz = tz_aware_fixture + i = date_range("20130101", periods=5, freq="h", tz=tz) + kwargs = {key: attrgetter(val)(i) for key, val in kwargs.items()} + result = DatetimeIndex(i, **kwargs) + tm.assert_index_equal(i, result) + + @pytest.mark.parametrize( + "kwargs", + [{"tz": "dtype.tz"}, {"dtype": "dtype"}, {"dtype": "dtype", "tz": "dtype.tz"}], + ) + def test_construction_with_alt_tz_localize(self, kwargs, tz_aware_fixture): + tz = tz_aware_fixture + i = date_range("20130101", periods=5, freq="h", tz=tz) + i = i._with_freq(None) + kwargs = {key: attrgetter(val)(i) for key, val in kwargs.items()} + + if "tz" in kwargs: + result = DatetimeIndex(i.asi8, tz="UTC").tz_convert(kwargs["tz"]) + + expected = DatetimeIndex(i, **kwargs) + tm.assert_index_equal(result, expected) + + # localize into the provided tz + i2 = DatetimeIndex(i.tz_localize(None).asi8, tz="UTC") + expected = i.tz_localize(None).tz_localize("UTC") + tm.assert_index_equal(i2, expected) + + # incompat tz/dtype + msg = "cannot supply both a tz and a dtype with a tz" + with pytest.raises(ValueError, match=msg): + DatetimeIndex(i.tz_localize(None).asi8, dtype=i.dtype, tz="US/Pacific") + + def test_construction_index_with_mixed_timezones(self): + # gh-11488: no tz results in DatetimeIndex + result = Index([Timestamp("2011-01-01"), Timestamp("2011-01-02")], name="idx") + exp = DatetimeIndex( + [Timestamp("2011-01-01"), Timestamp("2011-01-02")], name="idx" + ) + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + assert result.tz is None + + # same tz results in DatetimeIndex + result = Index( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="Asia/Tokyo"), + ], + name="idx", + ) + exp = DatetimeIndex( + [Timestamp("2011-01-01 10:00"), Timestamp("2011-01-02 10:00")], + tz="Asia/Tokyo", + name="idx", + ) + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + assert result.tz is not None + assert result.tz == exp.tz + + # same tz results in DatetimeIndex (DST) + result = Index( + [ + Timestamp("2011-01-01 10:00", tz="US/Eastern"), + Timestamp("2011-08-01 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = DatetimeIndex( + [Timestamp("2011-01-01 10:00"), Timestamp("2011-08-01 10:00")], + tz="US/Eastern", + name="idx", + ) + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + assert result.tz is not None + assert result.tz == exp.tz + + # Different tz results in Index(dtype=object) + result = Index( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = Index( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + dtype="object", + name="idx", + ) + tm.assert_index_equal(result, exp, exact=True) + assert not isinstance(result, DatetimeIndex) + + result = Index( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = Index( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + dtype="object", + name="idx", + ) + tm.assert_index_equal(result, exp, exact=True) + assert not isinstance(result, DatetimeIndex) + + msg = "DatetimeIndex has mixed timezones" + msg_depr = "parsing datetimes with mixed time zones will raise an error" + with pytest.raises(TypeError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg_depr): + DatetimeIndex(["2013-11-02 22:00-05:00", "2013-11-03 22:00-06:00"]) + + # length = 1 + result = Index([Timestamp("2011-01-01")], name="idx") + exp = DatetimeIndex([Timestamp("2011-01-01")], name="idx") + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + assert result.tz is None + + # length = 1 with tz + result = Index([Timestamp("2011-01-01 10:00", tz="Asia/Tokyo")], name="idx") + exp = DatetimeIndex( + [Timestamp("2011-01-01 10:00")], tz="Asia/Tokyo", name="idx" + ) + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + assert result.tz is not None + assert result.tz == exp.tz + + def test_construction_index_with_mixed_timezones_with_NaT(self): + # see gh-11488 + result = Index( + [pd.NaT, Timestamp("2011-01-01"), pd.NaT, Timestamp("2011-01-02")], + name="idx", + ) + exp = DatetimeIndex( + [pd.NaT, Timestamp("2011-01-01"), pd.NaT, Timestamp("2011-01-02")], + name="idx", + ) + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + assert result.tz is None + + # Same tz results in DatetimeIndex + result = Index( + [ + pd.NaT, + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + pd.NaT, + Timestamp("2011-01-02 10:00", tz="Asia/Tokyo"), + ], + name="idx", + ) + exp = DatetimeIndex( + [ + pd.NaT, + Timestamp("2011-01-01 10:00"), + pd.NaT, + Timestamp("2011-01-02 10:00"), + ], + tz="Asia/Tokyo", + name="idx", + ) + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + assert result.tz is not None + assert result.tz == exp.tz + + # same tz results in DatetimeIndex (DST) + result = Index( + [ + Timestamp("2011-01-01 10:00", tz="US/Eastern"), + pd.NaT, + Timestamp("2011-08-01 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = DatetimeIndex( + [Timestamp("2011-01-01 10:00"), pd.NaT, Timestamp("2011-08-01 10:00")], + tz="US/Eastern", + name="idx", + ) + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + assert result.tz is not None + assert result.tz == exp.tz + + # different tz results in Index(dtype=object) + result = Index( + [ + pd.NaT, + Timestamp("2011-01-01 10:00"), + pd.NaT, + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = Index( + [ + pd.NaT, + Timestamp("2011-01-01 10:00"), + pd.NaT, + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + dtype="object", + name="idx", + ) + tm.assert_index_equal(result, exp, exact=True) + assert not isinstance(result, DatetimeIndex) + + result = Index( + [ + pd.NaT, + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + pd.NaT, + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = Index( + [ + pd.NaT, + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + pd.NaT, + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + dtype="object", + name="idx", + ) + tm.assert_index_equal(result, exp, exact=True) + assert not isinstance(result, DatetimeIndex) + + # all NaT + result = Index([pd.NaT, pd.NaT], name="idx") + exp = DatetimeIndex([pd.NaT, pd.NaT], name="idx") + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + assert result.tz is None + + def test_construction_dti_with_mixed_timezones(self): + # GH 11488 (not changed, added explicit tests) + + # no tz results in DatetimeIndex + result = DatetimeIndex( + [Timestamp("2011-01-01"), Timestamp("2011-01-02")], name="idx" + ) + exp = DatetimeIndex( + [Timestamp("2011-01-01"), Timestamp("2011-01-02")], name="idx" + ) + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + + # same tz results in DatetimeIndex + result = DatetimeIndex( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="Asia/Tokyo"), + ], + name="idx", + ) + exp = DatetimeIndex( + [Timestamp("2011-01-01 10:00"), Timestamp("2011-01-02 10:00")], + tz="Asia/Tokyo", + name="idx", + ) + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + + # same tz results in DatetimeIndex (DST) + result = DatetimeIndex( + [ + Timestamp("2011-01-01 10:00", tz="US/Eastern"), + Timestamp("2011-08-01 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = DatetimeIndex( + [Timestamp("2011-01-01 10:00"), Timestamp("2011-08-01 10:00")], + tz="US/Eastern", + name="idx", + ) + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + + # tz mismatch affecting to tz-aware raises TypeError/ValueError + + msg = "cannot be converted to datetime64" + with pytest.raises(ValueError, match=msg): + DatetimeIndex( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + name="idx", + ) + + # pre-2.0 this raised bc of awareness mismatch. in 2.0 with a tz# + # specified we behave as if this was called pointwise, so + # the naive Timestamp is treated as a wall time. + dti = DatetimeIndex( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + tz="Asia/Tokyo", + name="idx", + ) + expected = DatetimeIndex( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="US/Eastern").tz_convert("Asia/Tokyo"), + ], + tz="Asia/Tokyo", + name="idx", + ) + tm.assert_index_equal(dti, expected) + + # pre-2.0 mixed-tz scalars raised even if a tz/dtype was specified. + # as of 2.0 we successfully return the requested tz/dtype + dti = DatetimeIndex( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + tz="US/Eastern", + name="idx", + ) + expected = DatetimeIndex( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo").tz_convert("US/Eastern"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + tz="US/Eastern", + name="idx", + ) + tm.assert_index_equal(dti, expected) + + # same thing but pass dtype instead of tz + dti = DatetimeIndex( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + dtype="M8[ns, US/Eastern]", + name="idx", + ) + tm.assert_index_equal(dti, expected) + + def test_construction_base_constructor(self): + arr = [Timestamp("2011-01-01"), pd.NaT, Timestamp("2011-01-03")] + tm.assert_index_equal(Index(arr), DatetimeIndex(arr)) + tm.assert_index_equal(Index(np.array(arr)), DatetimeIndex(np.array(arr))) + + arr = [np.nan, pd.NaT, Timestamp("2011-01-03")] + tm.assert_index_equal(Index(arr), DatetimeIndex(arr)) + tm.assert_index_equal(Index(np.array(arr)), DatetimeIndex(np.array(arr))) + + def test_construction_outofbounds(self): + # GH 13663 + dates = [ + datetime(3000, 1, 1), + datetime(4000, 1, 1), + datetime(5000, 1, 1), + datetime(6000, 1, 1), + ] + exp = Index(dates, dtype=object) + # coerces to object + tm.assert_index_equal(Index(dates), exp) + + msg = "^Out of bounds nanosecond timestamp: 3000-01-01 00:00:00, at position 0$" + with pytest.raises(OutOfBoundsDatetime, match=msg): + # can't create DatetimeIndex + DatetimeIndex(dates) + + @pytest.mark.parametrize("data", [["1400-01-01"], [datetime(1400, 1, 1)]]) + def test_dti_date_out_of_range(self, data): + # GH#1475 + msg = ( + "^Out of bounds nanosecond timestamp: " + "1400-01-01( 00:00:00)?, at position 0$" + ) + with pytest.raises(OutOfBoundsDatetime, match=msg): + DatetimeIndex(data) + + def test_construction_with_ndarray(self): + # GH 5152 + dates = [datetime(2013, 10, 7), datetime(2013, 10, 8), datetime(2013, 10, 9)] + data = DatetimeIndex(dates, freq=offsets.BDay()).values + result = DatetimeIndex(data, freq=offsets.BDay()) + expected = DatetimeIndex(["2013-10-07", "2013-10-08", "2013-10-09"], freq="B") + tm.assert_index_equal(result, expected) + + def test_integer_values_and_tz_interpreted_as_utc(self): + # GH-24559 + val = np.datetime64("2000-01-01 00:00:00", "ns") + values = np.array([val.view("i8")]) + + result = DatetimeIndex(values).tz_localize("US/Central") + + expected = DatetimeIndex(["2000-01-01T00:00:00"], dtype="M8[ns, US/Central]") + tm.assert_index_equal(result, expected) + + # but UTC is *not* deprecated. + with tm.assert_produces_warning(None): + result = DatetimeIndex(values, tz="UTC") + expected = DatetimeIndex(["2000-01-01T00:00:00"], dtype="M8[ns, UTC]") + tm.assert_index_equal(result, expected) + + def test_constructor_coverage(self): + msg = r"DatetimeIndex\(\.\.\.\) must be called with a collection" + with pytest.raises(TypeError, match=msg): + DatetimeIndex("1/1/2000") + + # generator expression + gen = (datetime(2000, 1, 1) + timedelta(i) for i in range(10)) + result = DatetimeIndex(gen) + expected = DatetimeIndex( + [datetime(2000, 1, 1) + timedelta(i) for i in range(10)] + ) + tm.assert_index_equal(result, expected) + + # NumPy string array + strings = np.array(["2000-01-01", "2000-01-02", "2000-01-03"]) + result = DatetimeIndex(strings) + expected = DatetimeIndex(strings.astype("O")) + tm.assert_index_equal(result, expected) + + from_ints = DatetimeIndex(expected.asi8) + tm.assert_index_equal(from_ints, expected) + + # string with NaT + strings = np.array(["2000-01-01", "2000-01-02", "NaT"]) + result = DatetimeIndex(strings) + expected = DatetimeIndex(strings.astype("O")) + tm.assert_index_equal(result, expected) + + from_ints = DatetimeIndex(expected.asi8) + tm.assert_index_equal(from_ints, expected) + + # non-conforming + msg = ( + "Inferred frequency None from passed values does not conform " + "to passed frequency D" + ) + with pytest.raises(ValueError, match=msg): + DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-04"], freq="D") + + @pytest.mark.parametrize("freq", ["YS", "W-SUN"]) + def test_constructor_datetime64_tzformat(self, freq): + # see GH#6572: ISO 8601 format results in stdlib timezone object + idx = date_range( + "2013-01-01T00:00:00-05:00", "2016-01-01T23:59:59-05:00", freq=freq + ) + expected = date_range( + "2013-01-01T00:00:00", + "2016-01-01T23:59:59", + freq=freq, + tz=timezone(timedelta(minutes=-300)), + ) + tm.assert_index_equal(idx, expected) + # Unable to use `US/Eastern` because of DST + expected_i8 = date_range( + "2013-01-01T00:00:00", "2016-01-01T23:59:59", freq=freq, tz="America/Lima" + ) + tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) + + idx = date_range( + "2013-01-01T00:00:00+09:00", "2016-01-01T23:59:59+09:00", freq=freq + ) + expected = date_range( + "2013-01-01T00:00:00", + "2016-01-01T23:59:59", + freq=freq, + tz=timezone(timedelta(minutes=540)), + ) + tm.assert_index_equal(idx, expected) + expected_i8 = date_range( + "2013-01-01T00:00:00", "2016-01-01T23:59:59", freq=freq, tz="Asia/Tokyo" + ) + tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) + + # Non ISO 8601 format results in dateutil.tz.tzoffset + idx = date_range("2013/1/1 0:00:00-5:00", "2016/1/1 23:59:59-5:00", freq=freq) + expected = date_range( + "2013-01-01T00:00:00", + "2016-01-01T23:59:59", + freq=freq, + tz=timezone(timedelta(minutes=-300)), + ) + tm.assert_index_equal(idx, expected) + # Unable to use `US/Eastern` because of DST + expected_i8 = date_range( + "2013-01-01T00:00:00", "2016-01-01T23:59:59", freq=freq, tz="America/Lima" + ) + tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) + + idx = date_range("2013/1/1 0:00:00+9:00", "2016/1/1 23:59:59+09:00", freq=freq) + expected = date_range( + "2013-01-01T00:00:00", + "2016-01-01T23:59:59", + freq=freq, + tz=timezone(timedelta(minutes=540)), + ) + tm.assert_index_equal(idx, expected) + expected_i8 = date_range( + "2013-01-01T00:00:00", "2016-01-01T23:59:59", freq=freq, tz="Asia/Tokyo" + ) + tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) + + def test_constructor_dtype(self): + # passing a dtype with a tz should localize + idx = DatetimeIndex( + ["2013-01-01", "2013-01-02"], dtype="datetime64[ns, US/Eastern]" + ) + expected = ( + DatetimeIndex(["2013-01-01", "2013-01-02"]) + .as_unit("ns") + .tz_localize("US/Eastern") + ) + tm.assert_index_equal(idx, expected) + + idx = DatetimeIndex(["2013-01-01", "2013-01-02"], tz="US/Eastern").as_unit("ns") + tm.assert_index_equal(idx, expected) + + def test_constructor_dtype_tz_mismatch_raises(self): + # if we already have a tz and its not the same, then raise + idx = DatetimeIndex( + ["2013-01-01", "2013-01-02"], dtype="datetime64[ns, US/Eastern]" + ) + + msg = ( + "cannot supply both a tz and a timezone-naive dtype " + r"\(i\.e\. datetime64\[ns\]\)" + ) + with pytest.raises(ValueError, match=msg): + DatetimeIndex(idx, dtype="datetime64[ns]") + + # this is effectively trying to convert tz's + msg = "data is already tz-aware US/Eastern, unable to set specified tz: CET" + with pytest.raises(TypeError, match=msg): + DatetimeIndex(idx, dtype="datetime64[ns, CET]") + msg = "cannot supply both a tz and a dtype with a tz" + with pytest.raises(ValueError, match=msg): + DatetimeIndex(idx, tz="CET", dtype="datetime64[ns, US/Eastern]") + + result = DatetimeIndex(idx, dtype="datetime64[ns, US/Eastern]") + tm.assert_index_equal(idx, result) + + @pytest.mark.parametrize("dtype", [object, np.int32, np.int64]) + def test_constructor_invalid_dtype_raises(self, dtype): + # GH 23986 + msg = "Unexpected value for 'dtype'" + with pytest.raises(ValueError, match=msg): + DatetimeIndex([1, 2], dtype=dtype) + + def test_000constructor_resolution(self): + # 2252 + t1 = Timestamp((1352934390 * 1000000000) + 1000000 + 1000 + 1) + idx = DatetimeIndex([t1]) + + assert idx.nanosecond[0] == t1.nanosecond + + def test_disallow_setting_tz(self): + # GH 3746 + dti = DatetimeIndex(["2010"], tz="UTC") + msg = "Cannot directly set timezone" + with pytest.raises(AttributeError, match=msg): + dti.tz = pytz.timezone("US/Pacific") + + @pytest.mark.parametrize( + "tz", + [ + None, + "America/Los_Angeles", + pytz.timezone("America/Los_Angeles"), + Timestamp("2000", tz="America/Los_Angeles").tz, + ], + ) + def test_constructor_start_end_with_tz(self, tz): + # GH 18595 + start = Timestamp("2013-01-01 06:00:00", tz="America/Los_Angeles") + end = Timestamp("2013-01-02 06:00:00", tz="America/Los_Angeles") + result = date_range(freq="D", start=start, end=end, tz=tz) + expected = DatetimeIndex( + ["2013-01-01 06:00:00", "2013-01-02 06:00:00"], + dtype="M8[ns, America/Los_Angeles]", + freq="D", + ) + tm.assert_index_equal(result, expected) + # Especially assert that the timezone is consistent for pytz + assert pytz.timezone("America/Los_Angeles") is result.tz + + @pytest.mark.parametrize("tz", ["US/Pacific", "US/Eastern", "Asia/Tokyo"]) + def test_constructor_with_non_normalized_pytz(self, tz): + # GH 18595 + non_norm_tz = Timestamp("2010", tz=tz).tz + result = DatetimeIndex(["2010"], tz=non_norm_tz) + assert pytz.timezone(tz) is result.tz + + def test_constructor_timestamp_near_dst(self): + # GH 20854 + ts = [ + Timestamp("2016-10-30 03:00:00+0300", tz="Europe/Helsinki"), + Timestamp("2016-10-30 03:00:00+0200", tz="Europe/Helsinki"), + ] + result = DatetimeIndex(ts) + expected = DatetimeIndex([ts[0].to_pydatetime(), ts[1].to_pydatetime()]) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("klass", [Index, DatetimeIndex]) + @pytest.mark.parametrize("box", [np.array, partial(np.array, dtype=object), list]) + @pytest.mark.parametrize( + "tz, dtype", + [("US/Pacific", "datetime64[ns, US/Pacific]"), (None, "datetime64[ns]")], + ) + def test_constructor_with_int_tz(self, klass, box, tz, dtype): + # GH 20997, 20964 + ts = Timestamp("2018-01-01", tz=tz).as_unit("ns") + result = klass(box([ts._value]), dtype=dtype) + expected = klass([ts]) + assert result == expected + + def test_construction_int_rountrip(self, tz_naive_fixture): + # GH 12619, GH#24559 + tz = tz_naive_fixture + + result = 1293858000000000000 + expected = DatetimeIndex([result], tz=tz).asi8[0] + assert result == expected + + def test_construction_from_replaced_timestamps_with_dst(self): + # GH 18785 + index = date_range( + Timestamp(2000, 12, 31), + Timestamp(2005, 12, 31), + freq="YE-DEC", + tz="Australia/Melbourne", + ) + result = DatetimeIndex([x.replace(month=6, day=1) for x in index]) + expected = DatetimeIndex( + [ + "2000-06-01 00:00:00", + "2001-06-01 00:00:00", + "2002-06-01 00:00:00", + "2003-06-01 00:00:00", + "2004-06-01 00:00:00", + "2005-06-01 00:00:00", + ], + tz="Australia/Melbourne", + ) + tm.assert_index_equal(result, expected) + + def test_construction_with_tz_and_tz_aware_dti(self): + # GH 23579 + dti = date_range("2016-01-01", periods=3, tz="US/Central") + msg = "data is already tz-aware US/Central, unable to set specified tz" + with pytest.raises(TypeError, match=msg): + DatetimeIndex(dti, tz="Asia/Tokyo") + + def test_construction_with_nat_and_tzlocal(self): + tz = dateutil.tz.tzlocal() + result = DatetimeIndex(["2018", "NaT"], tz=tz) + expected = DatetimeIndex([Timestamp("2018", tz=tz), pd.NaT]) + tm.assert_index_equal(result, expected) + + def test_constructor_with_ambiguous_keyword_arg(self): + # GH 35297 + + expected = DatetimeIndex( + ["2020-11-01 01:00:00", "2020-11-02 01:00:00"], + dtype="datetime64[ns, America/New_York]", + freq="D", + ambiguous=False, + ) + + # ambiguous keyword in start + timezone = "America/New_York" + start = Timestamp(year=2020, month=11, day=1, hour=1).tz_localize( + timezone, ambiguous=False + ) + result = date_range(start=start, periods=2, ambiguous=False) + tm.assert_index_equal(result, expected) + + # ambiguous keyword in end + timezone = "America/New_York" + end = Timestamp(year=2020, month=11, day=2, hour=1).tz_localize( + timezone, ambiguous=False + ) + result = date_range(end=end, periods=2, ambiguous=False) + tm.assert_index_equal(result, expected) + + def test_constructor_with_nonexistent_keyword_arg(self, warsaw): + # GH 35297 + timezone = warsaw + + # nonexistent keyword in start + start = Timestamp("2015-03-29 02:30:00").tz_localize( + timezone, nonexistent="shift_forward" + ) + result = date_range(start=start, periods=2, freq="h") + expected = DatetimeIndex( + [ + Timestamp("2015-03-29 03:00:00+02:00", tz=timezone), + Timestamp("2015-03-29 04:00:00+02:00", tz=timezone), + ] + ) + + tm.assert_index_equal(result, expected) + + # nonexistent keyword in end + end = start + result = date_range(end=end, periods=2, freq="h") + expected = DatetimeIndex( + [ + Timestamp("2015-03-29 01:00:00+01:00", tz=timezone), + Timestamp("2015-03-29 03:00:00+02:00", tz=timezone), + ] + ) + + tm.assert_index_equal(result, expected) + + def test_constructor_no_precision_raises(self): + # GH-24753, GH-24739 + + msg = "with no precision is not allowed" + with pytest.raises(ValueError, match=msg): + DatetimeIndex(["2000"], dtype="datetime64") + + msg = "The 'datetime64' dtype has no unit. Please pass in" + with pytest.raises(ValueError, match=msg): + Index(["2000"], dtype="datetime64") + + def test_constructor_wrong_precision_raises(self): + dti = DatetimeIndex(["2000"], dtype="datetime64[us]") + assert dti.dtype == "M8[us]" + assert dti[0] == Timestamp(2000, 1, 1) + + def test_index_constructor_with_numpy_object_array_and_timestamp_tz_with_nan(self): + # GH 27011 + result = Index(np.array([Timestamp("2019", tz="UTC"), np.nan], dtype=object)) + expected = DatetimeIndex([Timestamp("2019", tz="UTC"), pd.NaT]) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) + def test_dti_from_tzaware_datetime(self, tz): + d = [datetime(2012, 8, 19, tzinfo=tz)] + + index = DatetimeIndex(d) + assert timezones.tz_compare(index.tz, tz) + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_dti_tz_constructors(self, tzstr): + """Test different DatetimeIndex constructions with timezone + Follow-up of GH#4229 + """ + arr = ["11/10/2005 08:00:00", "11/10/2005 09:00:00"] + + idx1 = to_datetime(arr).tz_localize(tzstr) + idx2 = date_range(start="2005-11-10 08:00:00", freq="h", periods=2, tz=tzstr) + idx2 = idx2._with_freq(None) # the others all have freq=None + idx3 = DatetimeIndex(arr, tz=tzstr) + idx4 = DatetimeIndex(np.array(arr), tz=tzstr) + + for other in [idx2, idx3, idx4]: + tm.assert_index_equal(idx1, other) + + def test_dti_construction_idempotent(self, unit): + rng = date_range( + "03/12/2012 00:00", periods=10, freq="W-FRI", tz="US/Eastern", unit=unit + ) + rng2 = DatetimeIndex(data=rng, tz="US/Eastern") + tm.assert_index_equal(rng, rng2) + + @pytest.mark.parametrize("prefix", ["", "dateutil/"]) + def test_dti_constructor_static_tzinfo(self, prefix): + # it works! + index = DatetimeIndex([datetime(2012, 1, 1)], tz=prefix + "EST") + index.hour + index[0] + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_dti_convert_datetime_list(self, tzstr): + dr = date_range("2012-06-02", periods=10, tz=tzstr, name="foo") + dr2 = DatetimeIndex(list(dr), name="foo", freq="D") + tm.assert_index_equal(dr, dr2) + + @pytest.mark.parametrize( + "tz", + [ + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + ], + ) + @pytest.mark.parametrize("use_str", [True, False]) + @pytest.mark.parametrize("box_cls", [Timestamp, DatetimeIndex]) + def test_dti_ambiguous_matches_timestamp(self, tz, use_str, box_cls, request): + # GH#47471 check that we get the same raising behavior in the DTI + # constructor and Timestamp constructor + dtstr = "2013-11-03 01:59:59.999999" + item = dtstr + if not use_str: + item = Timestamp(dtstr).to_pydatetime() + if box_cls is not Timestamp: + item = [item] + + if not use_str and isinstance(tz, dateutil.tz.tzfile): + # FIXME: The Timestamp constructor here behaves differently than all + # the other cases bc with dateutil/zoneinfo tzinfos we implicitly + # get fold=0. Having this raise is not important, but having the + # behavior be consistent across cases is. + mark = pytest.mark.xfail(reason="We implicitly get fold=0.") + request.applymarker(mark) + + with pytest.raises(pytz.AmbiguousTimeError, match=dtstr): + box_cls(item, tz=tz) + + @pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"]) + def test_dti_constructor_with_non_nano_dtype(self, tz): + # GH#55756, GH#54620 + ts = Timestamp("2999-01-01") + dtype = "M8[us]" + if tz is not None: + dtype = f"M8[us, {tz}]" + vals = [ts, "2999-01-02 03:04:05.678910", 2500] + result = DatetimeIndex(vals, dtype=dtype) + # The 2500 is interpreted as microseconds, consistent with what + # we would get if we created DatetimeIndexes from vals[:2] and vals[2:] + # and concated the results. + pointwise = [ + vals[0].tz_localize(tz), + Timestamp(vals[1], tz=tz), + to_datetime(vals[2], unit="us", utc=True).tz_convert(tz), + ] + exp_vals = [x.as_unit("us").asm8 for x in pointwise] + exp_arr = np.array(exp_vals, dtype="M8[us]") + expected = DatetimeIndex(exp_arr, dtype="M8[us]") + if tz is not None: + expected = expected.tz_localize("UTC").tz_convert(tz) + tm.assert_index_equal(result, expected) + + result2 = DatetimeIndex(np.array(vals, dtype=object), dtype=dtype) + tm.assert_index_equal(result2, expected) + + def test_dti_constructor_with_non_nano_now_today(self): + # GH#55756 + now = Timestamp.now() + today = Timestamp.today() + result = DatetimeIndex(["now", "today"], dtype="M8[s]") + assert result.dtype == "M8[s]" + + # result may not exactly match [now, today] so we'll test it up to a tolerance. + # (it *may* match exactly due to rounding) + tolerance = pd.Timedelta(microseconds=1) + + diff0 = result[0] - now.as_unit("s") + assert diff0 >= pd.Timedelta(0) + assert diff0 < tolerance + + diff1 = result[1] - today.as_unit("s") + assert diff1 >= pd.Timedelta(0) + assert diff1 < tolerance + + def test_dti_constructor_object_float_matches_float_dtype(self): + # GH#55780 + arr = np.array([0, np.nan], dtype=np.float64) + arr2 = arr.astype(object) + + dti1 = DatetimeIndex(arr, tz="CET") + dti2 = DatetimeIndex(arr2, tz="CET") + tm.assert_index_equal(dti1, dti2) + + @pytest.mark.parametrize("dtype", ["M8[us]", "M8[us, US/Pacific]"]) + def test_dti_constructor_with_dtype_object_int_matches_int_dtype(self, dtype): + # Going through the object path should match the non-object path + + vals1 = np.arange(5, dtype="i8") * 1000 + vals1[0] = pd.NaT.value + + vals2 = vals1.astype(np.float64) + vals2[0] = np.nan + + vals3 = vals1.astype(object) + # change lib.infer_dtype(vals3) from "integer" so we go through + # array_to_datetime in _sequence_to_dt64 + vals3[0] = pd.NaT + + vals4 = vals2.astype(object) + + res1 = DatetimeIndex(vals1, dtype=dtype) + res2 = DatetimeIndex(vals2, dtype=dtype) + res3 = DatetimeIndex(vals3, dtype=dtype) + res4 = DatetimeIndex(vals4, dtype=dtype) + + expected = DatetimeIndex(vals1.view("M8[us]")) + if res1.tz is not None: + expected = expected.tz_localize("UTC").tz_convert(res1.tz) + tm.assert_index_equal(res1, expected) + tm.assert_index_equal(res2, expected) + tm.assert_index_equal(res3, expected) + tm.assert_index_equal(res4, expected) + + +class TestTimeSeries: + def test_dti_constructor_preserve_dti_freq(self): + rng = date_range("1/1/2000", "1/2/2000", freq="5min") + + rng2 = DatetimeIndex(rng) + assert rng.freq == rng2.freq + + def test_explicit_none_freq(self): + # Explicitly passing freq=None is respected + rng = date_range("1/1/2000", "1/2/2000", freq="5min") + + result = DatetimeIndex(rng, freq=None) + assert result.freq is None + + result = DatetimeIndex(rng._data, freq=None) + assert result.freq is None + + def test_dti_constructor_small_int(self, any_int_numpy_dtype): + # see gh-13721 + exp = DatetimeIndex( + [ + "1970-01-01 00:00:00.00000000", + "1970-01-01 00:00:00.00000001", + "1970-01-01 00:00:00.00000002", + ] + ) + + arr = np.array([0, 10, 20], dtype=any_int_numpy_dtype) + tm.assert_index_equal(DatetimeIndex(arr), exp) + + def test_ctor_str_intraday(self): + rng = DatetimeIndex(["1-1-2000 00:00:01"]) + assert rng[0].second == 1 + + def test_index_cast_datetime64_other_units(self): + arr = np.arange(0, 100, 10, dtype=np.int64).view("M8[D]") + idx = Index(arr) + + assert (idx.values == astype_overflowsafe(arr, dtype=np.dtype("M8[ns]"))).all() + + def test_constructor_int64_nocopy(self): + # GH#1624 + arr = np.arange(1000, dtype=np.int64) + index = DatetimeIndex(arr) + + arr[50:100] = -1 + assert (index.asi8[50:100] == -1).all() + + arr = np.arange(1000, dtype=np.int64) + index = DatetimeIndex(arr, copy=True) + + arr[50:100] = -1 + assert (index.asi8[50:100] != -1).all() + + @pytest.mark.parametrize( + "freq", + ["ME", "QE", "YE", "D", "B", "bh", "min", "s", "ms", "us", "h", "ns", "C"], + ) + def test_from_freq_recreate_from_data(self, freq): + org = date_range(start="2001/02/01 09:00", freq=freq, periods=1) + idx = DatetimeIndex(org, freq=freq) + tm.assert_index_equal(idx, org) + + org = date_range( + start="2001/02/01 09:00", freq=freq, tz="US/Pacific", periods=1 + ) + idx = DatetimeIndex(org, freq=freq, tz="US/Pacific") + tm.assert_index_equal(idx, org) + + def test_datetimeindex_constructor_misc(self): + arr = ["1/1/2005", "1/2/2005", "Jn 3, 2005", "2005-01-04"] + msg = r"(\(')?Unknown datetime string format(:', 'Jn 3, 2005'\))?" + with pytest.raises(ValueError, match=msg): + DatetimeIndex(arr) + + arr = ["1/1/2005", "1/2/2005", "1/3/2005", "2005-01-04"] + idx1 = DatetimeIndex(arr) + + arr = [datetime(2005, 1, 1), "1/2/2005", "1/3/2005", "2005-01-04"] + idx2 = DatetimeIndex(arr) + + arr = [Timestamp(datetime(2005, 1, 1)), "1/2/2005", "1/3/2005", "2005-01-04"] + idx3 = DatetimeIndex(arr) + + arr = np.array(["1/1/2005", "1/2/2005", "1/3/2005", "2005-01-04"], dtype="O") + idx4 = DatetimeIndex(arr) + + idx5 = DatetimeIndex(["12/05/2007", "25/01/2008"], dayfirst=True) + idx6 = DatetimeIndex( + ["2007/05/12", "2008/01/25"], dayfirst=False, yearfirst=True + ) + tm.assert_index_equal(idx5, idx6) + + for other in [idx2, idx3, idx4]: + assert (idx1.values == other.values).all() + + def test_dti_constructor_object_dtype_dayfirst_yearfirst_with_tz(self): + # GH#55813 + val = "5/10/16" + + dfirst = Timestamp(2016, 10, 5, tz="US/Pacific") + yfirst = Timestamp(2005, 10, 16, tz="US/Pacific") + + result1 = DatetimeIndex([val], tz="US/Pacific", dayfirst=True) + expected1 = DatetimeIndex([dfirst]) + tm.assert_index_equal(result1, expected1) + + result2 = DatetimeIndex([val], tz="US/Pacific", yearfirst=True) + expected2 = DatetimeIndex([yfirst]) + tm.assert_index_equal(result2, expected2) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_date_range.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_date_range.py new file mode 100644 index 0000000000000000000000000000000000000000..d26bee80003e92092722790d9c38225a3b16b035 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_date_range.py @@ -0,0 +1,1721 @@ +""" +test date_range, bdate_range construction from the convenience range functions +""" + +from datetime import ( + datetime, + time, + timedelta, +) +import re + +import numpy as np +import pytest +import pytz +from pytz import timezone + +from pandas._libs.tslibs import timezones +from pandas._libs.tslibs.offsets import ( + BDay, + CDay, + DateOffset, + MonthEnd, + prefix_mapping, +) +from pandas.errors import OutOfBoundsDatetime +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Series, + Timedelta, + Timestamp, + bdate_range, + date_range, + offsets, +) +import pandas._testing as tm +from pandas.core.arrays.datetimes import _generate_range as generate_range +from pandas.tests.indexes.datetimes.test_timezones import ( + FixedOffset, + fixed_off_no_name, +) + +from pandas.tseries.holiday import USFederalHolidayCalendar + +START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) + + +def _get_expected_range( + begin_to_match, + end_to_match, + both_range, + inclusive_endpoints, +): + """Helper to get expected range from a both inclusive range""" + left_match = begin_to_match == both_range[0] + right_match = end_to_match == both_range[-1] + + if inclusive_endpoints == "left" and right_match: + expected_range = both_range[:-1] + elif inclusive_endpoints == "right" and left_match: + expected_range = both_range[1:] + elif inclusive_endpoints == "neither" and left_match and right_match: + expected_range = both_range[1:-1] + elif inclusive_endpoints == "neither" and right_match: + expected_range = both_range[:-1] + elif inclusive_endpoints == "neither" and left_match: + expected_range = both_range[1:] + elif inclusive_endpoints == "both": + expected_range = both_range[:] + else: + expected_range = both_range[:] + + return expected_range + + +class TestTimestampEquivDateRange: + # Older tests in TestTimeSeries constructed their `stamp` objects + # using `date_range` instead of the `Timestamp` constructor. + # TestTimestampEquivDateRange checks that these are equivalent in the + # pertinent cases. + + def test_date_range_timestamp_equiv(self): + rng = date_range("20090415", "20090519", tz="US/Eastern") + stamp = rng[0] + + ts = Timestamp("20090415", tz="US/Eastern") + assert ts == stamp + + def test_date_range_timestamp_equiv_dateutil(self): + rng = date_range("20090415", "20090519", tz="dateutil/US/Eastern") + stamp = rng[0] + + ts = Timestamp("20090415", tz="dateutil/US/Eastern") + assert ts == stamp + + def test_date_range_timestamp_equiv_explicit_pytz(self): + rng = date_range("20090415", "20090519", tz=pytz.timezone("US/Eastern")) + stamp = rng[0] + + ts = Timestamp("20090415", tz=pytz.timezone("US/Eastern")) + assert ts == stamp + + @td.skip_if_windows + def test_date_range_timestamp_equiv_explicit_dateutil(self): + from pandas._libs.tslibs.timezones import dateutil_gettz as gettz + + rng = date_range("20090415", "20090519", tz=gettz("US/Eastern")) + stamp = rng[0] + + ts = Timestamp("20090415", tz=gettz("US/Eastern")) + assert ts == stamp + + def test_date_range_timestamp_equiv_from_datetime_instance(self): + datetime_instance = datetime(2014, 3, 4) + # build a timestamp with a frequency, since then it supports + # addition/subtraction of integers + timestamp_instance = date_range(datetime_instance, periods=1, freq="D")[0] + + ts = Timestamp(datetime_instance) + assert ts == timestamp_instance + + def test_date_range_timestamp_equiv_preserve_frequency(self): + timestamp_instance = date_range("2014-03-05", periods=1, freq="D")[0] + ts = Timestamp("2014-03-05") + + assert timestamp_instance == ts + + +class TestDateRanges: + def test_date_range_name(self): + idx = date_range(start="2000-01-01", periods=1, freq="YE", name="TEST") + assert idx.name == "TEST" + + def test_date_range_invalid_periods(self): + msg = "periods must be a number, got foo" + with pytest.raises(TypeError, match=msg): + date_range(start="1/1/2000", periods="foo", freq="D") + + def test_date_range_fractional_period(self): + msg = "Non-integer 'periods' in pd.date_range, pd.timedelta_range" + with tm.assert_produces_warning(FutureWarning, match=msg): + rng = date_range("1/1/2000", periods=10.5) + exp = date_range("1/1/2000", periods=10) + tm.assert_index_equal(rng, exp) + + @pytest.mark.parametrize( + "freq,freq_depr", + [ + ("2ME", "2M"), + ("2SME", "2SM"), + ("2BQE", "2BQ"), + ("2BYE", "2BY"), + ], + ) + def test_date_range_frequency_M_SM_BQ_BY_deprecated(self, freq, freq_depr): + # GH#52064 + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " + f"in a future version, please use '{freq[1:]}' instead." + + expected = date_range("1/1/2000", periods=4, freq=freq) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = date_range("1/1/2000", periods=4, freq=freq_depr) + tm.assert_index_equal(result, expected) + + def test_date_range_tuple_freq_raises(self): + # GH#34703 + edate = datetime(2000, 1, 1) + with pytest.raises(TypeError, match="pass as a string instead"): + date_range(end=edate, freq=("D", 5), periods=20) + + @pytest.mark.parametrize("freq", ["ns", "us", "ms", "min", "s", "h", "D"]) + def test_date_range_edges(self, freq): + # GH#13672 + td = Timedelta(f"1{freq}") + ts = Timestamp("1970-01-01") + + idx = date_range( + start=ts + td, + end=ts + 4 * td, + freq=freq, + ) + exp = DatetimeIndex( + [ts + n * td for n in range(1, 5)], + dtype="M8[ns]", + freq=freq, + ) + tm.assert_index_equal(idx, exp) + + # start after end + idx = date_range( + start=ts + 4 * td, + end=ts + td, + freq=freq, + ) + exp = DatetimeIndex([], dtype="M8[ns]", freq=freq) + tm.assert_index_equal(idx, exp) + + # start matches end + idx = date_range( + start=ts + td, + end=ts + td, + freq=freq, + ) + exp = DatetimeIndex([ts + td], dtype="M8[ns]", freq=freq) + tm.assert_index_equal(idx, exp) + + def test_date_range_near_implementation_bound(self): + # GH#??? + freq = Timedelta(1) + + with pytest.raises(OutOfBoundsDatetime, match="Cannot generate range with"): + date_range(end=Timestamp.min, periods=2, freq=freq) + + def test_date_range_nat(self): + # GH#11587 + msg = "Neither `start` nor `end` can be NaT" + with pytest.raises(ValueError, match=msg): + date_range(start="2016-01-01", end=pd.NaT, freq="D") + with pytest.raises(ValueError, match=msg): + date_range(start=pd.NaT, end="2016-01-01", freq="D") + + def test_date_range_multiplication_overflow(self): + # GH#24255 + # check that overflows in calculating `addend = periods * stride` + # are caught + with tm.assert_produces_warning(None): + # we should _not_ be seeing a overflow RuntimeWarning + dti = date_range(start="1677-09-22", periods=213503, freq="D") + + assert dti[0] == Timestamp("1677-09-22") + assert len(dti) == 213503 + + msg = "Cannot generate range with" + with pytest.raises(OutOfBoundsDatetime, match=msg): + date_range("1969-05-04", periods=200000000, freq="30000D") + + def test_date_range_unsigned_overflow_handling(self): + # GH#24255 + # case where `addend = periods * stride` overflows int64 bounds + # but not uint64 bounds + dti = date_range(start="1677-09-22", end="2262-04-11", freq="D") + + dti2 = date_range(start=dti[0], periods=len(dti), freq="D") + assert dti2.equals(dti) + + dti3 = date_range(end=dti[-1], periods=len(dti), freq="D") + assert dti3.equals(dti) + + def test_date_range_int64_overflow_non_recoverable(self): + # GH#24255 + # case with start later than 1970-01-01, overflow int64 but not uint64 + msg = "Cannot generate range with" + with pytest.raises(OutOfBoundsDatetime, match=msg): + date_range(start="1970-02-01", periods=106752 * 24, freq="h") + + # case with end before 1970-01-01, overflow int64 but not uint64 + with pytest.raises(OutOfBoundsDatetime, match=msg): + date_range(end="1969-11-14", periods=106752 * 24, freq="h") + + @pytest.mark.slow + @pytest.mark.parametrize( + "s_ts, e_ts", [("2262-02-23", "1969-11-14"), ("1970-02-01", "1677-10-22")] + ) + def test_date_range_int64_overflow_stride_endpoint_different_signs( + self, s_ts, e_ts + ): + # cases where stride * periods overflow int64 and stride/endpoint + # have different signs + start = Timestamp(s_ts) + end = Timestamp(e_ts) + + expected = date_range(start=start, end=end, freq="-1h") + assert expected[0] == start + assert expected[-1] == end + + dti = date_range(end=end, periods=len(expected), freq="-1h") + tm.assert_index_equal(dti, expected) + + def test_date_range_out_of_bounds(self): + # GH#14187 + msg = "Cannot generate range" + with pytest.raises(OutOfBoundsDatetime, match=msg): + date_range("2016-01-01", periods=100000, freq="D") + with pytest.raises(OutOfBoundsDatetime, match=msg): + date_range(end="1763-10-12", periods=100000, freq="D") + + def test_date_range_gen_error(self): + rng = date_range("1/1/2000 00:00", "1/1/2000 00:18", freq="5min") + assert len(rng) == 4 + + def test_date_range_normalize(self): + snap = datetime.today() + n = 50 + + rng = date_range(snap, periods=n, normalize=False, freq="2D") + + offset = timedelta(2) + expected = DatetimeIndex( + [snap + i * offset for i in range(n)], dtype="M8[ns]", freq=offset + ) + + tm.assert_index_equal(rng, expected) + + rng = date_range("1/1/2000 08:15", periods=n, normalize=False, freq="B") + the_time = time(8, 15) + for val in rng: + assert val.time() == the_time + + def test_date_range_ambiguous_arguments(self): + # #2538 + start = datetime(2011, 1, 1, 5, 3, 40) + end = datetime(2011, 1, 1, 8, 9, 40) + + msg = ( + "Of the four parameters: start, end, periods, and " + "freq, exactly three must be specified" + ) + with pytest.raises(ValueError, match=msg): + date_range(start, end, periods=10, freq="s") + + def test_date_range_convenience_periods(self, unit): + # GH 20808 + result = date_range("2018-04-24", "2018-04-27", periods=3, unit=unit) + expected = DatetimeIndex( + ["2018-04-24 00:00:00", "2018-04-25 12:00:00", "2018-04-27 00:00:00"], + dtype=f"M8[{unit}]", + freq=None, + ) + + tm.assert_index_equal(result, expected) + + # Test if spacing remains linear if tz changes to dst in range + result = date_range( + "2018-04-01 01:00:00", + "2018-04-01 04:00:00", + tz="Australia/Sydney", + periods=3, + unit=unit, + ) + expected = DatetimeIndex( + [ + Timestamp("2018-04-01 01:00:00+1100", tz="Australia/Sydney"), + Timestamp("2018-04-01 02:00:00+1000", tz="Australia/Sydney"), + Timestamp("2018-04-01 04:00:00+1000", tz="Australia/Sydney"), + ] + ).as_unit(unit) + tm.assert_index_equal(result, expected) + + def test_date_range_index_comparison(self): + rng = date_range("2011-01-01", periods=3, tz="US/Eastern") + df = Series(rng).to_frame() + arr = np.array([rng.to_list()]).T + arr2 = np.array([rng]).T + + with pytest.raises(ValueError, match="Unable to coerce to Series"): + rng == df + + with pytest.raises(ValueError, match="Unable to coerce to Series"): + df == rng + + expected = DataFrame([True, True, True]) + + results = df == arr2 + tm.assert_frame_equal(results, expected) + + expected = Series([True, True, True], name=0) + + results = df[0] == arr2[:, 0] + tm.assert_series_equal(results, expected) + + expected = np.array( + [[True, False, False], [False, True, False], [False, False, True]] + ) + results = rng == arr + tm.assert_numpy_array_equal(results, expected) + + @pytest.mark.parametrize( + "start,end,result_tz", + [ + ["20180101", "20180103", "US/Eastern"], + [datetime(2018, 1, 1), datetime(2018, 1, 3), "US/Eastern"], + [Timestamp("20180101"), Timestamp("20180103"), "US/Eastern"], + [ + Timestamp("20180101", tz="US/Eastern"), + Timestamp("20180103", tz="US/Eastern"), + "US/Eastern", + ], + [ + Timestamp("20180101", tz="US/Eastern"), + Timestamp("20180103", tz="US/Eastern"), + None, + ], + ], + ) + def test_date_range_linspacing_tz(self, start, end, result_tz): + # GH 20983 + result = date_range(start, end, periods=3, tz=result_tz) + expected = date_range("20180101", periods=3, freq="D", tz="US/Eastern") + tm.assert_index_equal(result, expected) + + def test_date_range_timedelta(self): + start = "2020-01-01" + end = "2020-01-11" + rng1 = date_range(start, end, freq="3D") + rng2 = date_range(start, end, freq=timedelta(days=3)) + tm.assert_index_equal(rng1, rng2) + + def test_range_misspecified(self): + # GH #1095 + msg = ( + "Of the four parameters: start, end, periods, and " + "freq, exactly three must be specified" + ) + + with pytest.raises(ValueError, match=msg): + date_range(start="1/1/2000") + + with pytest.raises(ValueError, match=msg): + date_range(end="1/1/2000") + + with pytest.raises(ValueError, match=msg): + date_range(periods=10) + + with pytest.raises(ValueError, match=msg): + date_range(start="1/1/2000", freq="h") + + with pytest.raises(ValueError, match=msg): + date_range(end="1/1/2000", freq="h") + + with pytest.raises(ValueError, match=msg): + date_range(periods=10, freq="h") + + with pytest.raises(ValueError, match=msg): + date_range() + + def test_compat_replace(self): + # https://github.com/statsmodels/statsmodels/issues/3349 + # replace should take ints/longs for compat + result = date_range(Timestamp("1960-04-01 00:00:00"), periods=76, freq="QS-JAN") + assert len(result) == 76 + + def test_catch_infinite_loop(self): + offset = offsets.DateOffset(minute=5) + # blow up, don't loop forever + msg = "Offset did not increment date" + with pytest.raises(ValueError, match=msg): + date_range(datetime(2011, 11, 11), datetime(2011, 11, 12), freq=offset) + + def test_construct_over_dst(self, unit): + # GH 20854 + pre_dst = Timestamp("2010-11-07 01:00:00").tz_localize( + "US/Pacific", ambiguous=True + ) + pst_dst = Timestamp("2010-11-07 01:00:00").tz_localize( + "US/Pacific", ambiguous=False + ) + expect_data = [ + Timestamp("2010-11-07 00:00:00", tz="US/Pacific"), + pre_dst, + pst_dst, + ] + expected = DatetimeIndex(expect_data, freq="h").as_unit(unit) + result = date_range( + start="2010-11-7", periods=3, freq="h", tz="US/Pacific", unit=unit + ) + tm.assert_index_equal(result, expected) + + def test_construct_with_different_start_end_string_format(self, unit): + # GH 12064 + result = date_range( + "2013-01-01 00:00:00+09:00", + "2013/01/01 02:00:00+09:00", + freq="h", + unit=unit, + ) + expected = DatetimeIndex( + [ + Timestamp("2013-01-01 00:00:00+09:00"), + Timestamp("2013-01-01 01:00:00+09:00"), + Timestamp("2013-01-01 02:00:00+09:00"), + ], + freq="h", + ).as_unit(unit) + tm.assert_index_equal(result, expected) + + def test_error_with_zero_monthends(self): + msg = r"Offset <0 \* MonthEnds> did not increment date" + with pytest.raises(ValueError, match=msg): + date_range("1/1/2000", "1/1/2001", freq=MonthEnd(0)) + + def test_range_bug(self, unit): + # GH #770 + offset = DateOffset(months=3) + result = date_range("2011-1-1", "2012-1-31", freq=offset, unit=unit) + + start = datetime(2011, 1, 1) + expected = DatetimeIndex( + [start + i * offset for i in range(5)], dtype=f"M8[{unit}]", freq=offset + ) + tm.assert_index_equal(result, expected) + + def test_range_tz_pytz(self): + # see gh-2906 + tz = timezone("US/Eastern") + start = tz.localize(datetime(2011, 1, 1)) + end = tz.localize(datetime(2011, 1, 3)) + + dr = date_range(start=start, periods=3) + assert dr.tz.zone == tz.zone + assert dr[0] == start + assert dr[2] == end + + dr = date_range(end=end, periods=3) + assert dr.tz.zone == tz.zone + assert dr[0] == start + assert dr[2] == end + + dr = date_range(start=start, end=end) + assert dr.tz.zone == tz.zone + assert dr[0] == start + assert dr[2] == end + + @pytest.mark.parametrize( + "start, end", + [ + [ + Timestamp(datetime(2014, 3, 6), tz="US/Eastern"), + Timestamp(datetime(2014, 3, 12), tz="US/Eastern"), + ], + [ + Timestamp(datetime(2013, 11, 1), tz="US/Eastern"), + Timestamp(datetime(2013, 11, 6), tz="US/Eastern"), + ], + ], + ) + def test_range_tz_dst_straddle_pytz(self, start, end): + dr = date_range(start, end, freq="D") + assert dr[0] == start + assert dr[-1] == end + assert np.all(dr.hour == 0) + + dr = date_range(start, end, freq="D", tz="US/Eastern") + assert dr[0] == start + assert dr[-1] == end + assert np.all(dr.hour == 0) + + dr = date_range( + start.replace(tzinfo=None), + end.replace(tzinfo=None), + freq="D", + tz="US/Eastern", + ) + assert dr[0] == start + assert dr[-1] == end + assert np.all(dr.hour == 0) + + def test_range_tz_dateutil(self): + # see gh-2906 + + # Use maybe_get_tz to fix filename in tz under dateutil. + from pandas._libs.tslibs.timezones import maybe_get_tz + + tz = lambda x: maybe_get_tz("dateutil/" + x) + + start = datetime(2011, 1, 1, tzinfo=tz("US/Eastern")) + end = datetime(2011, 1, 3, tzinfo=tz("US/Eastern")) + + dr = date_range(start=start, periods=3) + assert dr.tz == tz("US/Eastern") + assert dr[0] == start + assert dr[2] == end + + dr = date_range(end=end, periods=3) + assert dr.tz == tz("US/Eastern") + assert dr[0] == start + assert dr[2] == end + + dr = date_range(start=start, end=end) + assert dr.tz == tz("US/Eastern") + assert dr[0] == start + assert dr[2] == end + + @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3h", "YE"]) + @pytest.mark.parametrize("tz", [None, "US/Eastern"]) + def test_range_closed(self, freq, tz, inclusive_endpoints_fixture): + # GH#12409, GH#12684 + + begin = Timestamp("2011/1/1", tz=tz) + end = Timestamp("2014/1/1", tz=tz) + + result_range = date_range( + begin, end, inclusive=inclusive_endpoints_fixture, freq=freq + ) + both_range = date_range(begin, end, inclusive="both", freq=freq) + expected_range = _get_expected_range( + begin, end, both_range, inclusive_endpoints_fixture + ) + + tm.assert_index_equal(expected_range, result_range) + + @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3h", "YE"]) + def test_range_with_tz_closed_with_tz_aware_start_end( + self, freq, inclusive_endpoints_fixture + ): + begin = Timestamp("2011/1/1") + end = Timestamp("2014/1/1") + begintz = Timestamp("2011/1/1", tz="US/Eastern") + endtz = Timestamp("2014/1/1", tz="US/Eastern") + + result_range = date_range( + begin, + end, + inclusive=inclusive_endpoints_fixture, + freq=freq, + tz="US/Eastern", + ) + both_range = date_range( + begin, end, inclusive="both", freq=freq, tz="US/Eastern" + ) + expected_range = _get_expected_range( + begintz, + endtz, + both_range, + inclusive_endpoints_fixture, + ) + + tm.assert_index_equal(expected_range, result_range) + + def test_range_closed_boundary(self, inclusive_endpoints_fixture): + # GH#11804 + right_boundary = date_range( + "2015-09-12", + "2015-12-01", + freq="QS-MAR", + inclusive=inclusive_endpoints_fixture, + ) + left_boundary = date_range( + "2015-09-01", + "2015-09-12", + freq="QS-MAR", + inclusive=inclusive_endpoints_fixture, + ) + both_boundary = date_range( + "2015-09-01", + "2015-12-01", + freq="QS-MAR", + inclusive=inclusive_endpoints_fixture, + ) + neither_boundary = date_range( + "2015-09-11", + "2015-09-12", + freq="QS-MAR", + inclusive=inclusive_endpoints_fixture, + ) + + expected_right = both_boundary + expected_left = both_boundary + expected_both = both_boundary + + if inclusive_endpoints_fixture == "right": + expected_left = both_boundary[1:] + elif inclusive_endpoints_fixture == "left": + expected_right = both_boundary[:-1] + elif inclusive_endpoints_fixture == "both": + expected_right = both_boundary[1:] + expected_left = both_boundary[:-1] + + expected_neither = both_boundary[1:-1] + + tm.assert_index_equal(right_boundary, expected_right) + tm.assert_index_equal(left_boundary, expected_left) + tm.assert_index_equal(both_boundary, expected_both) + tm.assert_index_equal(neither_boundary, expected_neither) + + def test_date_range_years_only(self, tz_naive_fixture): + tz = tz_naive_fixture + # GH#6961 + rng1 = date_range("2014", "2015", freq="ME", tz=tz) + expected1 = date_range("2014-01-31", "2014-12-31", freq="ME", tz=tz) + tm.assert_index_equal(rng1, expected1) + + rng2 = date_range("2014", "2015", freq="MS", tz=tz) + expected2 = date_range("2014-01-01", "2015-01-01", freq="MS", tz=tz) + tm.assert_index_equal(rng2, expected2) + + rng3 = date_range("2014", "2020", freq="YE", tz=tz) + expected3 = date_range("2014-12-31", "2019-12-31", freq="YE", tz=tz) + tm.assert_index_equal(rng3, expected3) + + rng4 = date_range("2014", "2020", freq="YS", tz=tz) + expected4 = date_range("2014-01-01", "2020-01-01", freq="YS", tz=tz) + tm.assert_index_equal(rng4, expected4) + + def test_freq_divides_end_in_nanos(self): + # GH 10885 + result_1 = date_range("2005-01-12 10:00", "2005-01-12 16:00", freq="345min") + result_2 = date_range("2005-01-13 10:00", "2005-01-13 16:00", freq="345min") + expected_1 = DatetimeIndex( + ["2005-01-12 10:00:00", "2005-01-12 15:45:00"], + dtype="datetime64[ns]", + freq="345min", + tz=None, + ) + expected_2 = DatetimeIndex( + ["2005-01-13 10:00:00", "2005-01-13 15:45:00"], + dtype="datetime64[ns]", + freq="345min", + tz=None, + ) + tm.assert_index_equal(result_1, expected_1) + tm.assert_index_equal(result_2, expected_2) + + def test_cached_range_bug(self): + rng = date_range("2010-09-01 05:00:00", periods=50, freq=DateOffset(hours=6)) + assert len(rng) == 50 + assert rng[0] == datetime(2010, 9, 1, 5) + + def test_timezone_comparison_bug(self): + # smoke test + start = Timestamp("20130220 10:00", tz="US/Eastern") + result = date_range(start, periods=2, tz="US/Eastern") + assert len(result) == 2 + + def test_timezone_comparison_assert(self): + start = Timestamp("20130220 10:00", tz="US/Eastern") + msg = "Inferred time zone not equal to passed time zone" + with pytest.raises(AssertionError, match=msg): + date_range(start, periods=2, tz="Europe/Berlin") + + def test_negative_non_tick_frequency_descending_dates(self, tz_aware_fixture): + # GH 23270 + tz = tz_aware_fixture + result = date_range(start="2011-06-01", end="2011-01-01", freq="-1MS", tz=tz) + expected = date_range(end="2011-06-01", start="2011-01-01", freq="1MS", tz=tz)[ + ::-1 + ] + tm.assert_index_equal(result, expected) + + def test_range_where_start_equal_end(self, inclusive_endpoints_fixture): + # GH 43394 + start = "2021-09-02" + end = "2021-09-02" + result = date_range( + start=start, end=end, freq="D", inclusive=inclusive_endpoints_fixture + ) + + both_range = date_range(start=start, end=end, freq="D", inclusive="both") + if inclusive_endpoints_fixture == "neither": + expected = both_range[1:-1] + elif inclusive_endpoints_fixture in ("left", "right", "both"): + expected = both_range[:] + + tm.assert_index_equal(result, expected) + + def test_freq_dateoffset_with_relateivedelta_nanos(self): + # GH 46877 + freq = DateOffset(hours=10, days=57, nanoseconds=3) + result = date_range(end="1970-01-01 00:00:00", periods=10, freq=freq, name="a") + expected = DatetimeIndex( + [ + "1968-08-02T05:59:59.999999973", + "1968-09-28T15:59:59.999999976", + "1968-11-25T01:59:59.999999979", + "1969-01-21T11:59:59.999999982", + "1969-03-19T21:59:59.999999985", + "1969-05-16T07:59:59.999999988", + "1969-07-12T17:59:59.999999991", + "1969-09-08T03:59:59.999999994", + "1969-11-04T13:59:59.999999997", + "1970-01-01T00:00:00.000000000", + ], + name="a", + ) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "freq,freq_depr", + [ + ("h", "H"), + ("2min", "2T"), + ("1s", "1S"), + ("2ms", "2L"), + ("1us", "1U"), + ("2ns", "2N"), + ], + ) + def test_frequencies_H_T_S_L_U_N_deprecated(self, freq, freq_depr): + # GH#52536 + freq_msg = re.split("[0-9]*", freq, maxsplit=1)[1] + freq_depr_msg = re.split("[0-9]*", freq_depr, maxsplit=1)[1] + msg = ( + f"'{freq_depr_msg}' is deprecated and will be removed in a future version, " + ) + f"please use '{freq_msg}' instead" + + expected = date_range("1/1/2000", periods=2, freq=freq) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = date_range("1/1/2000", periods=2, freq=freq_depr) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "freq,freq_depr", + [ + ("200YE", "200A"), + ("YE", "Y"), + ("2YE-MAY", "2A-MAY"), + ("YE-MAY", "Y-MAY"), + ], + ) + def test_frequencies_A_deprecated_Y_renamed(self, freq, freq_depr): + # GH#9586, GH#54275 + freq_msg = re.split("[0-9]*", freq, maxsplit=1)[1] + freq_depr_msg = re.split("[0-9]*", freq_depr, maxsplit=1)[1] + msg = f"'{freq_depr_msg}' is deprecated and will be removed " + f"in a future version, please use '{freq_msg}' instead." + + expected = date_range("1/1/2000", periods=2, freq=freq) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = date_range("1/1/2000", periods=2, freq=freq_depr) + tm.assert_index_equal(result, expected) + + def test_to_offset_with_lowercase_deprecated_freq(self) -> None: + # https://github.com/pandas-dev/pandas/issues/56847 + msg = ( + "'m' is deprecated and will be removed in a future version, please use " + "'ME' instead." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = date_range("2010-01-01", periods=2, freq="m") + expected = DatetimeIndex(["2010-01-31", "2010-02-28"], freq="ME") + tm.assert_index_equal(result, expected) + + def test_date_range_bday(self): + sdate = datetime(1999, 12, 25) + idx = date_range(start=sdate, freq="1B", periods=20) + assert len(idx) == 20 + assert idx[0] == sdate + 0 * offsets.BDay() + assert idx.freq == "B" + + +class TestDateRangeTZ: + """Tests for date_range with timezones""" + + def test_hongkong_tz_convert(self): + # GH#1673 smoke test + dr = date_range("2012-01-01", "2012-01-10", freq="D", tz="Hongkong") + + # it works! + dr.hour + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_date_range_span_dst_transition(self, tzstr): + # GH#1778 + + # Standard -> Daylight Savings Time + dr = date_range("03/06/2012 00:00", periods=200, freq="W-FRI", tz="US/Eastern") + + assert (dr.hour == 0).all() + + dr = date_range("2012-11-02", periods=10, tz=tzstr) + result = dr.hour + expected = pd.Index([0] * 10, dtype="int32") + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_date_range_timezone_str_argument(self, tzstr): + tz = timezones.maybe_get_tz(tzstr) + result = date_range("1/1/2000", periods=10, tz=tzstr) + expected = date_range("1/1/2000", periods=10, tz=tz) + + tm.assert_index_equal(result, expected) + + def test_date_range_with_fixed_tz(self): + off = FixedOffset(420, "+07:00") + start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off) + end = datetime(2012, 6, 11, 5, 0, 0, tzinfo=off) + rng = date_range(start=start, end=end) + assert off == rng.tz + + rng2 = date_range(start, periods=len(rng), tz=off) + tm.assert_index_equal(rng, rng2) + + rng3 = date_range("3/11/2012 05:00:00+07:00", "6/11/2012 05:00:00+07:00") + assert (rng.values == rng3.values).all() + + def test_date_range_with_fixedoffset_noname(self): + off = fixed_off_no_name + start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off) + end = datetime(2012, 6, 11, 5, 0, 0, tzinfo=off) + rng = date_range(start=start, end=end) + assert off == rng.tz + + idx = pd.Index([start, end]) + assert off == idx.tz + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_date_range_with_tz(self, tzstr): + stamp = Timestamp("3/11/2012 05:00", tz=tzstr) + assert stamp.hour == 5 + + rng = date_range("3/11/2012 04:00", periods=10, freq="h", tz=tzstr) + + assert stamp == rng[1] + + @pytest.mark.parametrize("tz", ["Europe/London", "dateutil/Europe/London"]) + def test_date_range_ambiguous_endpoint(self, tz): + # construction with an ambiguous end-point + # GH#11626 + + with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): + date_range( + "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", freq="h" + ) + + times = date_range( + "2013-10-26 23:00", "2013-10-27 01:00", freq="h", tz=tz, ambiguous="infer" + ) + assert times[0] == Timestamp("2013-10-26 23:00", tz=tz) + assert times[-1] == Timestamp("2013-10-27 01:00:00+0000", tz=tz) + + @pytest.mark.parametrize( + "tz, option, expected", + [ + ["US/Pacific", "shift_forward", "2019-03-10 03:00"], + ["dateutil/US/Pacific", "shift_forward", "2019-03-10 03:00"], + ["US/Pacific", "shift_backward", "2019-03-10 01:00"], + ["dateutil/US/Pacific", "shift_backward", "2019-03-10 01:00"], + ["US/Pacific", timedelta(hours=1), "2019-03-10 03:00"], + ], + ) + def test_date_range_nonexistent_endpoint(self, tz, option, expected): + # construction with an nonexistent end-point + + with pytest.raises(pytz.NonExistentTimeError, match="2019-03-10 02:00:00"): + date_range( + "2019-03-10 00:00", "2019-03-10 02:00", tz="US/Pacific", freq="h" + ) + + times = date_range( + "2019-03-10 00:00", "2019-03-10 02:00", freq="h", tz=tz, nonexistent=option + ) + assert times[-1] == Timestamp(expected, tz=tz) + + +class TestGenRangeGeneration: + @pytest.mark.parametrize( + "freqstr,offset", + [ + ("B", BDay()), + ("C", CDay()), + ], + ) + def test_generate(self, freqstr, offset): + rng1 = list(generate_range(START, END, periods=None, offset=offset, unit="ns")) + rng2 = list(generate_range(START, END, periods=None, offset=freqstr, unit="ns")) + assert rng1 == rng2 + + def test_1(self): + rng = list( + generate_range( + start=datetime(2009, 3, 25), + end=None, + periods=2, + offset=BDay(), + unit="ns", + ) + ) + expected = [datetime(2009, 3, 25), datetime(2009, 3, 26)] + assert rng == expected + + def test_2(self): + rng = list( + generate_range( + start=datetime(2008, 1, 1), + end=datetime(2008, 1, 3), + periods=None, + offset=BDay(), + unit="ns", + ) + ) + expected = [datetime(2008, 1, 1), datetime(2008, 1, 2), datetime(2008, 1, 3)] + assert rng == expected + + def test_3(self): + rng = list( + generate_range( + start=datetime(2008, 1, 5), + end=datetime(2008, 1, 6), + periods=None, + offset=BDay(), + unit="ns", + ) + ) + expected = [] + assert rng == expected + + def test_precision_finer_than_offset(self): + # GH#9907 + result1 = date_range( + start="2015-04-15 00:00:03", end="2016-04-22 00:00:00", freq="QE" + ) + result2 = date_range( + start="2015-04-15 00:00:03", end="2015-06-22 00:00:04", freq="W" + ) + expected1_list = [ + "2015-06-30 00:00:03", + "2015-09-30 00:00:03", + "2015-12-31 00:00:03", + "2016-03-31 00:00:03", + ] + expected2_list = [ + "2015-04-19 00:00:03", + "2015-04-26 00:00:03", + "2015-05-03 00:00:03", + "2015-05-10 00:00:03", + "2015-05-17 00:00:03", + "2015-05-24 00:00:03", + "2015-05-31 00:00:03", + "2015-06-07 00:00:03", + "2015-06-14 00:00:03", + "2015-06-21 00:00:03", + ] + expected1 = DatetimeIndex( + expected1_list, dtype="datetime64[ns]", freq="QE-DEC", tz=None + ) + expected2 = DatetimeIndex( + expected2_list, dtype="datetime64[ns]", freq="W-SUN", tz=None + ) + tm.assert_index_equal(result1, expected1) + tm.assert_index_equal(result2, expected2) + + dt1, dt2 = "2017-01-01", "2017-01-01" + tz1, tz2 = "US/Eastern", "Europe/London" + + @pytest.mark.parametrize( + "start,end", + [ + (Timestamp(dt1, tz=tz1), Timestamp(dt2)), + (Timestamp(dt1), Timestamp(dt2, tz=tz2)), + (Timestamp(dt1, tz=tz1), Timestamp(dt2, tz=tz2)), + (Timestamp(dt1, tz=tz2), Timestamp(dt2, tz=tz1)), + ], + ) + def test_mismatching_tz_raises_err(self, start, end): + # issue 18488 + msg = "Start and end cannot both be tz-aware with different timezones" + with pytest.raises(TypeError, match=msg): + date_range(start, end) + with pytest.raises(TypeError, match=msg): + date_range(start, end, freq=BDay()) + + +class TestBusinessDateRange: + def test_constructor(self): + bdate_range(START, END, freq=BDay()) + bdate_range(START, periods=20, freq=BDay()) + bdate_range(end=START, periods=20, freq=BDay()) + + msg = "periods must be a number, got B" + with pytest.raises(TypeError, match=msg): + date_range("2011-1-1", "2012-1-1", "B") + + with pytest.raises(TypeError, match=msg): + bdate_range("2011-1-1", "2012-1-1", "B") + + msg = "freq must be specified for bdate_range; use date_range instead" + with pytest.raises(TypeError, match=msg): + bdate_range(START, END, periods=10, freq=None) + + def test_misc(self): + end = datetime(2009, 5, 13) + dr = bdate_range(end=end, periods=20) + firstDate = end - 19 * BDay() + + assert len(dr) == 20 + assert dr[0] == firstDate + assert dr[-1] == end + + def test_date_parse_failure(self): + badly_formed_date = "2007/100/1" + + msg = "Unknown datetime string format, unable to parse: 2007/100/1" + with pytest.raises(ValueError, match=msg): + Timestamp(badly_formed_date) + + with pytest.raises(ValueError, match=msg): + bdate_range(start=badly_formed_date, periods=10) + + with pytest.raises(ValueError, match=msg): + bdate_range(end=badly_formed_date, periods=10) + + with pytest.raises(ValueError, match=msg): + bdate_range(badly_formed_date, badly_formed_date) + + def test_daterange_bug_456(self): + # GH #456 + rng1 = bdate_range("12/5/2011", "12/5/2011") + rng2 = bdate_range("12/2/2011", "12/5/2011") + assert rng2._data.freq == BDay() + + result = rng1.union(rng2) + assert isinstance(result, DatetimeIndex) + + @pytest.mark.parametrize("inclusive", ["left", "right", "neither", "both"]) + def test_bdays_and_open_boundaries(self, inclusive): + # GH 6673 + start = "2018-07-21" # Saturday + end = "2018-07-29" # Sunday + result = date_range(start, end, freq="B", inclusive=inclusive) + + bday_start = "2018-07-23" # Monday + bday_end = "2018-07-27" # Friday + expected = date_range(bday_start, bday_end, freq="D") + tm.assert_index_equal(result, expected) + # Note: we do _not_ expect the freqs to match here + + def test_bday_near_overflow(self): + # GH#24252 avoid doing unnecessary addition that _would_ overflow + start = Timestamp.max.floor("D").to_pydatetime() + rng = date_range(start, end=None, periods=1, freq="B") + expected = DatetimeIndex([start], freq="B").as_unit("ns") + tm.assert_index_equal(rng, expected) + + def test_bday_overflow_error(self): + # GH#24252 check that we get OutOfBoundsDatetime and not OverflowError + msg = "Out of bounds nanosecond timestamp" + start = Timestamp.max.floor("D").to_pydatetime() + with pytest.raises(OutOfBoundsDatetime, match=msg): + date_range(start, periods=2, freq="B") + + +class TestCustomDateRange: + def test_constructor(self): + bdate_range(START, END, freq=CDay()) + bdate_range(START, periods=20, freq=CDay()) + bdate_range(end=START, periods=20, freq=CDay()) + + msg = "periods must be a number, got C" + with pytest.raises(TypeError, match=msg): + date_range("2011-1-1", "2012-1-1", "C") + + with pytest.raises(TypeError, match=msg): + bdate_range("2011-1-1", "2012-1-1", "C") + + def test_misc(self): + end = datetime(2009, 5, 13) + dr = bdate_range(end=end, periods=20, freq="C") + firstDate = end - 19 * CDay() + + assert len(dr) == 20 + assert dr[0] == firstDate + assert dr[-1] == end + + def test_daterange_bug_456(self): + # GH #456 + rng1 = bdate_range("12/5/2011", "12/5/2011", freq="C") + rng2 = bdate_range("12/2/2011", "12/5/2011", freq="C") + assert rng2._data.freq == CDay() + + result = rng1.union(rng2) + assert isinstance(result, DatetimeIndex) + + def test_cdaterange(self, unit): + result = bdate_range("2013-05-01", periods=3, freq="C", unit=unit) + expected = DatetimeIndex( + ["2013-05-01", "2013-05-02", "2013-05-03"], dtype=f"M8[{unit}]", freq="C" + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + def test_cdaterange_weekmask(self, unit): + result = bdate_range( + "2013-05-01", periods=3, freq="C", weekmask="Sun Mon Tue Wed Thu", unit=unit + ) + expected = DatetimeIndex( + ["2013-05-01", "2013-05-02", "2013-05-05"], + dtype=f"M8[{unit}]", + freq=result.freq, + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + # raise with non-custom freq + msg = ( + "a custom frequency string is required when holidays or " + "weekmask are passed, got frequency B" + ) + with pytest.raises(ValueError, match=msg): + bdate_range("2013-05-01", periods=3, weekmask="Sun Mon Tue Wed Thu") + + def test_cdaterange_holidays(self, unit): + result = bdate_range( + "2013-05-01", periods=3, freq="C", holidays=["2013-05-01"], unit=unit + ) + expected = DatetimeIndex( + ["2013-05-02", "2013-05-03", "2013-05-06"], + dtype=f"M8[{unit}]", + freq=result.freq, + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + # raise with non-custom freq + msg = ( + "a custom frequency string is required when holidays or " + "weekmask are passed, got frequency B" + ) + with pytest.raises(ValueError, match=msg): + bdate_range("2013-05-01", periods=3, holidays=["2013-05-01"]) + + def test_cdaterange_weekmask_and_holidays(self, unit): + result = bdate_range( + "2013-05-01", + periods=3, + freq="C", + weekmask="Sun Mon Tue Wed Thu", + holidays=["2013-05-01"], + unit=unit, + ) + expected = DatetimeIndex( + ["2013-05-02", "2013-05-05", "2013-05-06"], + dtype=f"M8[{unit}]", + freq=result.freq, + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + def test_cdaterange_holidays_weekmask_requires_freqstr(self): + # raise with non-custom freq + msg = ( + "a custom frequency string is required when holidays or " + "weekmask are passed, got frequency B" + ) + with pytest.raises(ValueError, match=msg): + bdate_range( + "2013-05-01", + periods=3, + weekmask="Sun Mon Tue Wed Thu", + holidays=["2013-05-01"], + ) + + @pytest.mark.parametrize( + "freq", [freq for freq in prefix_mapping if freq.startswith("C")] + ) + def test_all_custom_freq(self, freq): + # should not raise + bdate_range( + START, END, freq=freq, weekmask="Mon Wed Fri", holidays=["2009-03-14"] + ) + + bad_freq = freq + "FOO" + msg = f"invalid custom frequency string: {bad_freq}" + with pytest.raises(ValueError, match=msg): + bdate_range(START, END, freq=bad_freq) + + @pytest.mark.parametrize( + "start_end", + [ + ("2018-01-01T00:00:01.000Z", "2018-01-03T00:00:01.000Z"), + ("2018-01-01T00:00:00.010Z", "2018-01-03T00:00:00.010Z"), + ("2001-01-01T00:00:00.010Z", "2001-01-03T00:00:00.010Z"), + ], + ) + def test_range_with_millisecond_resolution(self, start_end): + # https://github.com/pandas-dev/pandas/issues/24110 + start, end = start_end + result = date_range(start=start, end=end, periods=2, inclusive="left") + expected = DatetimeIndex([start], dtype="M8[ns, UTC]") + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "start,period,expected", + [ + ("2022-07-23 00:00:00+02:00", 1, ["2022-07-25 00:00:00+02:00"]), + ("2022-07-22 00:00:00+02:00", 1, ["2022-07-22 00:00:00+02:00"]), + ( + "2022-07-22 00:00:00+02:00", + 2, + ["2022-07-22 00:00:00+02:00", "2022-07-25 00:00:00+02:00"], + ), + ], + ) + def test_range_with_timezone_and_custombusinessday(self, start, period, expected): + # GH49441 + result = date_range(start=start, periods=period, freq="C") + expected = DatetimeIndex(expected).as_unit("ns") + tm.assert_index_equal(result, expected) + + +class TestDateRangeNonNano: + def test_date_range_reso_validation(self): + msg = "'unit' must be one of 's', 'ms', 'us', 'ns'" + with pytest.raises(ValueError, match=msg): + date_range("2016-01-01", "2016-03-04", periods=3, unit="h") + + def test_date_range_freq_higher_than_reso(self): + # freq being higher-resolution than reso is a problem + msg = "Use a lower freq or a higher unit instead" + with pytest.raises(ValueError, match=msg): + # # TODO give a more useful or informative message? + date_range("2016-01-01", "2016-01-02", freq="ns", unit="ms") + + def test_date_range_freq_matches_reso(self): + # GH#49106 matching reso is OK + dti = date_range("2016-01-01", "2016-01-01 00:00:01", freq="ms", unit="ms") + rng = np.arange(1_451_606_400_000, 1_451_606_401_001, dtype=np.int64) + expected = DatetimeIndex(rng.view("M8[ms]"), freq="ms") + tm.assert_index_equal(dti, expected) + + dti = date_range("2016-01-01", "2016-01-01 00:00:01", freq="us", unit="us") + rng = np.arange(1_451_606_400_000_000, 1_451_606_401_000_001, dtype=np.int64) + expected = DatetimeIndex(rng.view("M8[us]"), freq="us") + tm.assert_index_equal(dti, expected) + + dti = date_range("2016-01-01", "2016-01-01 00:00:00.001", freq="ns", unit="ns") + rng = np.arange( + 1_451_606_400_000_000_000, 1_451_606_400_001_000_001, dtype=np.int64 + ) + expected = DatetimeIndex(rng.view("M8[ns]"), freq="ns") + tm.assert_index_equal(dti, expected) + + def test_date_range_freq_lower_than_endpoints(self): + start = Timestamp("2022-10-19 11:50:44.719781") + end = Timestamp("2022-10-19 11:50:47.066458") + + # start and end cannot be cast to "s" unit without lossy rounding, + # so we do not allow this in date_range + with pytest.raises(ValueError, match="Cannot losslessly convert units"): + date_range(start, end, periods=3, unit="s") + + # but we can losslessly cast to "us" + dti = date_range(start, end, periods=2, unit="us") + rng = np.array( + [start.as_unit("us")._value, end.as_unit("us")._value], dtype=np.int64 + ) + expected = DatetimeIndex(rng.view("M8[us]")) + tm.assert_index_equal(dti, expected) + + def test_date_range_non_nano(self): + start = np.datetime64("1066-10-14") # Battle of Hastings + end = np.datetime64("2305-07-13") # Jean-Luc Picard's birthday + + dti = date_range(start, end, freq="D", unit="s") + assert dti.freq == "D" + assert dti.dtype == "M8[s]" + + exp = np.arange( + start.astype("M8[s]").view("i8"), + (end + 1).astype("M8[s]").view("i8"), + 24 * 3600, + ).view("M8[s]") + + tm.assert_numpy_array_equal(dti.to_numpy(), exp) + + +class TestDateRangeNonTickFreq: + # Tests revolving around less-common (non-Tick) `freq` keywords. + + def test_date_range_custom_business_month_begin(self, unit): + hcal = USFederalHolidayCalendar() + freq = offsets.CBMonthBegin(calendar=hcal) + dti = date_range(start="20120101", end="20130101", freq=freq, unit=unit) + assert all(freq.is_on_offset(x) for x in dti) + + expected = DatetimeIndex( + [ + "2012-01-03", + "2012-02-01", + "2012-03-01", + "2012-04-02", + "2012-05-01", + "2012-06-01", + "2012-07-02", + "2012-08-01", + "2012-09-04", + "2012-10-01", + "2012-11-01", + "2012-12-03", + ], + dtype=f"M8[{unit}]", + freq=freq, + ) + tm.assert_index_equal(dti, expected) + + def test_date_range_custom_business_month_end(self, unit): + hcal = USFederalHolidayCalendar() + freq = offsets.CBMonthEnd(calendar=hcal) + dti = date_range(start="20120101", end="20130101", freq=freq, unit=unit) + assert all(freq.is_on_offset(x) for x in dti) + + expected = DatetimeIndex( + [ + "2012-01-31", + "2012-02-29", + "2012-03-30", + "2012-04-30", + "2012-05-31", + "2012-06-29", + "2012-07-31", + "2012-08-31", + "2012-09-28", + "2012-10-31", + "2012-11-30", + "2012-12-31", + ], + dtype=f"M8[{unit}]", + freq=freq, + ) + tm.assert_index_equal(dti, expected) + + def test_date_range_with_custom_holidays(self, unit): + # GH#30593 + freq = offsets.CustomBusinessHour(start="15:00", holidays=["2020-11-26"]) + result = date_range(start="2020-11-25 15:00", periods=4, freq=freq, unit=unit) + expected = DatetimeIndex( + [ + "2020-11-25 15:00:00", + "2020-11-25 16:00:00", + "2020-11-27 15:00:00", + "2020-11-27 16:00:00", + ], + dtype=f"M8[{unit}]", + freq=freq, + ) + tm.assert_index_equal(result, expected) + + def test_date_range_businesshour(self, unit): + idx = DatetimeIndex( + [ + "2014-07-04 09:00", + "2014-07-04 10:00", + "2014-07-04 11:00", + "2014-07-04 12:00", + "2014-07-04 13:00", + "2014-07-04 14:00", + "2014-07-04 15:00", + "2014-07-04 16:00", + ], + dtype=f"M8[{unit}]", + freq="bh", + ) + rng = date_range("2014-07-04 09:00", "2014-07-04 16:00", freq="bh", unit=unit) + tm.assert_index_equal(idx, rng) + + idx = DatetimeIndex( + ["2014-07-04 16:00", "2014-07-07 09:00"], dtype=f"M8[{unit}]", freq="bh" + ) + rng = date_range("2014-07-04 16:00", "2014-07-07 09:00", freq="bh", unit=unit) + tm.assert_index_equal(idx, rng) + + idx = DatetimeIndex( + [ + "2014-07-04 09:00", + "2014-07-04 10:00", + "2014-07-04 11:00", + "2014-07-04 12:00", + "2014-07-04 13:00", + "2014-07-04 14:00", + "2014-07-04 15:00", + "2014-07-04 16:00", + "2014-07-07 09:00", + "2014-07-07 10:00", + "2014-07-07 11:00", + "2014-07-07 12:00", + "2014-07-07 13:00", + "2014-07-07 14:00", + "2014-07-07 15:00", + "2014-07-07 16:00", + "2014-07-08 09:00", + "2014-07-08 10:00", + "2014-07-08 11:00", + "2014-07-08 12:00", + "2014-07-08 13:00", + "2014-07-08 14:00", + "2014-07-08 15:00", + "2014-07-08 16:00", + ], + dtype=f"M8[{unit}]", + freq="bh", + ) + rng = date_range("2014-07-04 09:00", "2014-07-08 16:00", freq="bh", unit=unit) + tm.assert_index_equal(idx, rng) + + def test_date_range_business_hour2(self, unit): + idx1 = date_range( + start="2014-07-04 15:00", end="2014-07-08 10:00", freq="bh", unit=unit + ) + idx2 = date_range(start="2014-07-04 15:00", periods=12, freq="bh", unit=unit) + idx3 = date_range(end="2014-07-08 10:00", periods=12, freq="bh", unit=unit) + expected = DatetimeIndex( + [ + "2014-07-04 15:00", + "2014-07-04 16:00", + "2014-07-07 09:00", + "2014-07-07 10:00", + "2014-07-07 11:00", + "2014-07-07 12:00", + "2014-07-07 13:00", + "2014-07-07 14:00", + "2014-07-07 15:00", + "2014-07-07 16:00", + "2014-07-08 09:00", + "2014-07-08 10:00", + ], + dtype=f"M8[{unit}]", + freq="bh", + ) + tm.assert_index_equal(idx1, expected) + tm.assert_index_equal(idx2, expected) + tm.assert_index_equal(idx3, expected) + + idx4 = date_range( + start="2014-07-04 15:45", end="2014-07-08 10:45", freq="bh", unit=unit + ) + idx5 = date_range(start="2014-07-04 15:45", periods=12, freq="bh", unit=unit) + idx6 = date_range(end="2014-07-08 10:45", periods=12, freq="bh", unit=unit) + + expected2 = expected + Timedelta(minutes=45).as_unit(unit) + expected2.freq = "bh" + tm.assert_index_equal(idx4, expected2) + tm.assert_index_equal(idx5, expected2) + tm.assert_index_equal(idx6, expected2) + + def test_date_range_business_hour_short(self, unit): + # GH#49835 + idx4 = date_range(start="2014-07-01 10:00", freq="bh", periods=1, unit=unit) + expected4 = DatetimeIndex(["2014-07-01 10:00"], dtype=f"M8[{unit}]", freq="bh") + tm.assert_index_equal(idx4, expected4) + + def test_date_range_year_start(self, unit): + # see GH#9313 + rng = date_range("1/1/2013", "7/1/2017", freq="YS", unit=unit) + exp = DatetimeIndex( + ["2013-01-01", "2014-01-01", "2015-01-01", "2016-01-01", "2017-01-01"], + dtype=f"M8[{unit}]", + freq="YS", + ) + tm.assert_index_equal(rng, exp) + + def test_date_range_year_end(self, unit): + # see GH#9313 + rng = date_range("1/1/2013", "7/1/2017", freq="YE", unit=unit) + exp = DatetimeIndex( + ["2013-12-31", "2014-12-31", "2015-12-31", "2016-12-31"], + dtype=f"M8[{unit}]", + freq="YE", + ) + tm.assert_index_equal(rng, exp) + + def test_date_range_negative_freq_year_end(self, unit): + # GH#11018 + rng = date_range("2011-12-31", freq="-2YE", periods=3, unit=unit) + exp = DatetimeIndex( + ["2011-12-31", "2009-12-31", "2007-12-31"], dtype=f"M8[{unit}]", freq="-2YE" + ) + tm.assert_index_equal(rng, exp) + assert rng.freq == "-2YE" + + def test_date_range_business_year_end_year(self, unit): + # see GH#9313 + rng = date_range("1/1/2013", "7/1/2017", freq="BYE", unit=unit) + exp = DatetimeIndex( + ["2013-12-31", "2014-12-31", "2015-12-31", "2016-12-30"], + dtype=f"M8[{unit}]", + freq="BYE", + ) + tm.assert_index_equal(rng, exp) + + def test_date_range_bms(self, unit): + # GH#1645 + result = date_range("1/1/2000", periods=10, freq="BMS", unit=unit) + + expected = DatetimeIndex( + [ + "2000-01-03", + "2000-02-01", + "2000-03-01", + "2000-04-03", + "2000-05-01", + "2000-06-01", + "2000-07-03", + "2000-08-01", + "2000-09-01", + "2000-10-02", + ], + dtype=f"M8[{unit}]", + freq="BMS", + ) + tm.assert_index_equal(result, expected) + + def test_date_range_semi_month_begin(self, unit): + dates = [ + datetime(2007, 12, 15), + datetime(2008, 1, 1), + datetime(2008, 1, 15), + datetime(2008, 2, 1), + datetime(2008, 2, 15), + datetime(2008, 3, 1), + datetime(2008, 3, 15), + datetime(2008, 4, 1), + datetime(2008, 4, 15), + datetime(2008, 5, 1), + datetime(2008, 5, 15), + datetime(2008, 6, 1), + datetime(2008, 6, 15), + datetime(2008, 7, 1), + datetime(2008, 7, 15), + datetime(2008, 8, 1), + datetime(2008, 8, 15), + datetime(2008, 9, 1), + datetime(2008, 9, 15), + datetime(2008, 10, 1), + datetime(2008, 10, 15), + datetime(2008, 11, 1), + datetime(2008, 11, 15), + datetime(2008, 12, 1), + datetime(2008, 12, 15), + ] + # ensure generating a range with DatetimeIndex gives same result + result = date_range(start=dates[0], end=dates[-1], freq="SMS", unit=unit) + exp = DatetimeIndex(dates, dtype=f"M8[{unit}]", freq="SMS") + tm.assert_index_equal(result, exp) + + def test_date_range_semi_month_end(self, unit): + dates = [ + datetime(2007, 12, 31), + datetime(2008, 1, 15), + datetime(2008, 1, 31), + datetime(2008, 2, 15), + datetime(2008, 2, 29), + datetime(2008, 3, 15), + datetime(2008, 3, 31), + datetime(2008, 4, 15), + datetime(2008, 4, 30), + datetime(2008, 5, 15), + datetime(2008, 5, 31), + datetime(2008, 6, 15), + datetime(2008, 6, 30), + datetime(2008, 7, 15), + datetime(2008, 7, 31), + datetime(2008, 8, 15), + datetime(2008, 8, 31), + datetime(2008, 9, 15), + datetime(2008, 9, 30), + datetime(2008, 10, 15), + datetime(2008, 10, 31), + datetime(2008, 11, 15), + datetime(2008, 11, 30), + datetime(2008, 12, 15), + datetime(2008, 12, 31), + ] + # ensure generating a range with DatetimeIndex gives same result + result = date_range(start=dates[0], end=dates[-1], freq="SME", unit=unit) + exp = DatetimeIndex(dates, dtype=f"M8[{unit}]", freq="SME") + tm.assert_index_equal(result, exp) + + def test_date_range_week_of_month(self, unit): + # GH#20517 + # Note the start here is not on_offset for this freq + result = date_range(start="20110101", periods=1, freq="WOM-1MON", unit=unit) + expected = DatetimeIndex(["2011-01-03"], dtype=f"M8[{unit}]", freq="WOM-1MON") + tm.assert_index_equal(result, expected) + + result2 = date_range(start="20110101", periods=2, freq="WOM-1MON", unit=unit) + expected2 = DatetimeIndex( + ["2011-01-03", "2011-02-07"], dtype=f"M8[{unit}]", freq="WOM-1MON" + ) + tm.assert_index_equal(result2, expected2) + + def test_date_range_week_of_month2(self, unit): + # GH#5115, GH#5348 + result = date_range("2013-1-1", periods=4, freq="WOM-1SAT", unit=unit) + expected = DatetimeIndex( + ["2013-01-05", "2013-02-02", "2013-03-02", "2013-04-06"], + dtype=f"M8[{unit}]", + freq="WOM-1SAT", + ) + tm.assert_index_equal(result, expected) + + def test_date_range_negative_freq_month_end(self, unit): + # GH#11018 + rng = date_range("2011-01-31", freq="-2ME", periods=3, unit=unit) + exp = DatetimeIndex( + ["2011-01-31", "2010-11-30", "2010-09-30"], dtype=f"M8[{unit}]", freq="-2ME" + ) + tm.assert_index_equal(rng, exp) + assert rng.freq == "-2ME" + + def test_date_range_fy5253(self, unit): + freq = offsets.FY5253(startingMonth=1, weekday=3, variation="nearest") + dti = date_range( + start="2013-01-01", + periods=2, + freq=freq, + unit=unit, + ) + expected = DatetimeIndex( + ["2013-01-31", "2014-01-30"], dtype=f"M8[{unit}]", freq=freq + ) + + tm.assert_index_equal(dti, expected) + + @pytest.mark.parametrize( + "freqstr,offset", + [ + ("QS", offsets.QuarterBegin(startingMonth=1)), + ("BQE", offsets.BQuarterEnd(startingMonth=12)), + ("W-SUN", offsets.Week(weekday=6)), + ], + ) + def test_date_range_freqstr_matches_offset(self, freqstr, offset): + sdate = datetime(1999, 12, 25) + edate = datetime(2000, 1, 1) + + idx1 = date_range(start=sdate, end=edate, freq=freqstr) + idx2 = date_range(start=sdate, end=edate, freq=offset) + assert len(idx1) == len(idx2) + assert idx1.freq == idx2.freq diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_datetime.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_datetime.py new file mode 100644 index 0000000000000000000000000000000000000000..f7fc64d4b01633edc011349441b1f75dd2f00cb9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_datetime.py @@ -0,0 +1,216 @@ +import datetime as dt +from datetime import date +import re + +import numpy as np +import pytest + +from pandas.compat.numpy import np_long + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + Timestamp, + date_range, + offsets, +) +import pandas._testing as tm + + +class TestDatetimeIndex: + def test_is_(self): + dti = date_range(start="1/1/2005", end="12/1/2005", freq="ME") + assert dti.is_(dti) + assert dti.is_(dti.view()) + assert not dti.is_(dti.copy()) + + def test_time_overflow_for_32bit_machines(self): + # GH8943. On some machines NumPy defaults to np.int32 (for example, + # 32-bit Linux machines). In the function _generate_regular_range + # found in tseries/index.py, `periods` gets multiplied by `strides` + # (which has value 1e9) and since the max value for np.int32 is ~2e9, + # and since those machines won't promote np.int32 to np.int64, we get + # overflow. + periods = np_long(1000) + + idx1 = date_range(start="2000", periods=periods, freq="s") + assert len(idx1) == periods + + idx2 = date_range(end="2000", periods=periods, freq="s") + assert len(idx2) == periods + + def test_nat(self): + assert DatetimeIndex([np.nan])[0] is pd.NaT + + def test_week_of_month_frequency(self): + # GH 5348: "ValueError: Could not evaluate WOM-1SUN" shouldn't raise + d1 = date(2002, 9, 1) + d2 = date(2013, 10, 27) + d3 = date(2012, 9, 30) + idx1 = DatetimeIndex([d1, d2]) + idx2 = DatetimeIndex([d3]) + result_append = idx1.append(idx2) + expected = DatetimeIndex([d1, d2, d3]) + tm.assert_index_equal(result_append, expected) + result_union = idx1.union(idx2) + expected = DatetimeIndex([d1, d3, d2]) + tm.assert_index_equal(result_union, expected) + + def test_append_nondatetimeindex(self): + rng = date_range("1/1/2000", periods=10) + idx = Index(["a", "b", "c", "d"]) + + result = rng.append(idx) + assert isinstance(result[0], Timestamp) + + def test_misc_coverage(self): + rng = date_range("1/1/2000", periods=5) + result = rng.groupby(rng.day) + assert isinstance(next(iter(result.values()))[0], Timestamp) + + # TODO: belongs in frame groupby tests? + def test_groupby_function_tuple_1677(self): + df = DataFrame( + np.random.default_rng(2).random(100), + index=date_range("1/1/2000", periods=100), + ) + monthly_group = df.groupby(lambda x: (x.year, x.month)) + + result = monthly_group.mean() + assert isinstance(result.index[0], tuple) + + def assert_index_parameters(self, index): + assert index.freq == "40960ns" + assert index.inferred_freq == "40960ns" + + def test_ns_index(self): + nsamples = 400 + ns = int(1e9 / 24414) + dtstart = np.datetime64("2012-09-20T00:00:00") + + dt = dtstart + np.arange(nsamples) * np.timedelta64(ns, "ns") + freq = ns * offsets.Nano() + index = DatetimeIndex(dt, freq=freq, name="time") + self.assert_index_parameters(index) + + new_index = date_range(start=index[0], end=index[-1], freq=index.freq) + self.assert_index_parameters(new_index) + + def test_asarray_tz_naive(self): + # This shouldn't produce a warning. + idx = date_range("2000", periods=2) + # M8[ns] by default + result = np.asarray(idx) + + expected = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]") + tm.assert_numpy_array_equal(result, expected) + + # optionally, object + result = np.asarray(idx, dtype=object) + + expected = np.array([Timestamp("2000-01-01"), Timestamp("2000-01-02")]) + tm.assert_numpy_array_equal(result, expected) + + def test_asarray_tz_aware(self): + tz = "US/Central" + idx = date_range("2000", periods=2, tz=tz) + expected = np.array(["2000-01-01T06", "2000-01-02T06"], dtype="M8[ns]") + result = np.asarray(idx, dtype="datetime64[ns]") + + tm.assert_numpy_array_equal(result, expected) + + # Old behavior with no warning + result = np.asarray(idx, dtype="M8[ns]") + + tm.assert_numpy_array_equal(result, expected) + + # Future behavior with no warning + expected = np.array( + [Timestamp("2000-01-01", tz=tz), Timestamp("2000-01-02", tz=tz)] + ) + result = np.asarray(idx, dtype=object) + + tm.assert_numpy_array_equal(result, expected) + + def test_CBH_deprecated(self): + msg = "'CBH' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = date_range( + dt.datetime(2022, 12, 11), dt.datetime(2022, 12, 13), freq="CBH" + ) + result = DatetimeIndex( + [ + "2022-12-12 09:00:00", + "2022-12-12 10:00:00", + "2022-12-12 11:00:00", + "2022-12-12 12:00:00", + "2022-12-12 13:00:00", + "2022-12-12 14:00:00", + "2022-12-12 15:00:00", + "2022-12-12 16:00:00", + ], + dtype="datetime64[ns]", + freq="cbh", + ) + + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "freq_depr, expected_values, expected_freq", + [ + ( + "AS-AUG", + ["2021-08-01", "2022-08-01", "2023-08-01"], + "YS-AUG", + ), + ( + "1BAS-MAY", + ["2021-05-03", "2022-05-02", "2023-05-01"], + "1BYS-MAY", + ), + ], + ) + def test_AS_BAS_deprecated(self, freq_depr, expected_values, expected_freq): + # GH#55479 + freq_msg = re.split("[0-9]*", freq_depr, maxsplit=1)[1] + msg = f"'{freq_msg}' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = date_range( + dt.datetime(2020, 12, 1), dt.datetime(2023, 12, 1), freq=freq_depr + ) + result = DatetimeIndex( + expected_values, + dtype="datetime64[ns]", + freq=expected_freq, + ) + + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "freq, expected_values, freq_depr", + [ + ("2BYE-MAR", ["2016-03-31"], "2BA-MAR"), + ("2BYE-JUN", ["2016-06-30"], "2BY-JUN"), + ("2BME", ["2016-02-29", "2016-04-29", "2016-06-30"], "2BM"), + ("2BQE", ["2016-03-31"], "2BQ"), + ("1BQE-MAR", ["2016-03-31", "2016-06-30"], "1BQ-MAR"), + ], + ) + def test_BM_BQ_BY_deprecated(self, freq, expected_values, freq_depr): + # GH#52064 + msg = f"'{freq_depr[1:]}' is deprecated and will be removed " + f"in a future version, please use '{freq[1:]}' instead." + + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = date_range(start="2016-02-21", end="2016-08-21", freq=freq_depr) + result = DatetimeIndex( + data=expected_values, + dtype="datetime64[ns]", + freq=freq, + ) + + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_formats.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_formats.py new file mode 100644 index 0000000000000000000000000000000000000000..b52eed8c509c6e655425eb5b9be3351f369fee4d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_formats.py @@ -0,0 +1,356 @@ +from datetime import datetime + +import dateutil.tz +import numpy as np +import pytest +import pytz + +import pandas as pd +from pandas import ( + DatetimeIndex, + NaT, + Series, +) +import pandas._testing as tm + + +@pytest.fixture(params=["s", "ms", "us", "ns"]) +def unit(request): + return request.param + + +def test_get_values_for_csv(): + index = pd.date_range(freq="1D", periods=3, start="2017-01-01") + + # First, with no arguments. + expected = np.array(["2017-01-01", "2017-01-02", "2017-01-03"], dtype=object) + + result = index._get_values_for_csv() + tm.assert_numpy_array_equal(result, expected) + + # No NaN values, so na_rep has no effect + result = index._get_values_for_csv(na_rep="pandas") + tm.assert_numpy_array_equal(result, expected) + + # Make sure date formatting works + expected = np.array(["01-2017-01", "01-2017-02", "01-2017-03"], dtype=object) + + result = index._get_values_for_csv(date_format="%m-%Y-%d") + tm.assert_numpy_array_equal(result, expected) + + # NULL object handling should work + index = DatetimeIndex(["2017-01-01", NaT, "2017-01-03"]) + expected = np.array(["2017-01-01", "NaT", "2017-01-03"], dtype=object) + + result = index._get_values_for_csv(na_rep="NaT") + tm.assert_numpy_array_equal(result, expected) + + expected = np.array(["2017-01-01", "pandas", "2017-01-03"], dtype=object) + + result = index._get_values_for_csv(na_rep="pandas") + tm.assert_numpy_array_equal(result, expected) + + result = index._get_values_for_csv(na_rep="NaT", date_format="%Y-%m-%d %H:%M:%S.%f") + expected = np.array( + ["2017-01-01 00:00:00.000000", "NaT", "2017-01-03 00:00:00.000000"], + dtype=object, + ) + tm.assert_numpy_array_equal(result, expected) + + # invalid format + result = index._get_values_for_csv(na_rep="NaT", date_format="foo") + expected = np.array(["foo", "NaT", "foo"], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + +class TestDatetimeIndexRendering: + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_dti_with_timezone_repr(self, tzstr): + rng = pd.date_range("4/13/2010", "5/6/2010") + + rng_eastern = rng.tz_localize(tzstr) + + rng_repr = repr(rng_eastern) + assert "2010-04-13 00:00:00" in rng_repr + + def test_dti_repr_dates(self): + text = str(pd.to_datetime([datetime(2013, 1, 1), datetime(2014, 1, 1)])) + assert "['2013-01-01'," in text + assert ", '2014-01-01']" in text + + def test_dti_repr_mixed(self): + text = str( + pd.to_datetime( + [datetime(2013, 1, 1), datetime(2014, 1, 1, 12), datetime(2014, 1, 1)] + ) + ) + assert "'2013-01-01 00:00:00'," in text + assert "'2014-01-01 00:00:00']" in text + + def test_dti_repr_short(self): + dr = pd.date_range(start="1/1/2012", periods=1) + repr(dr) + + dr = pd.date_range(start="1/1/2012", periods=2) + repr(dr) + + dr = pd.date_range(start="1/1/2012", periods=3) + repr(dr) + + @pytest.mark.parametrize( + "dates, freq, expected_repr", + [ + ( + ["2012-01-01 00:00:00"], + "60min", + ( + "DatetimeIndex(['2012-01-01 00:00:00'], " + "dtype='datetime64[ns]', freq='60min')" + ), + ), + ( + ["2012-01-01 00:00:00", "2012-01-01 01:00:00"], + "60min", + "DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 01:00:00'], " + "dtype='datetime64[ns]', freq='60min')", + ), + ( + ["2012-01-01"], + "24h", + "DatetimeIndex(['2012-01-01'], dtype='datetime64[ns]', freq='24h')", + ), + ], + ) + def test_dti_repr_time_midnight(self, dates, freq, expected_repr, unit): + # GH53634 + dti = DatetimeIndex(dates, freq).as_unit(unit) + actual_repr = repr(dti) + assert actual_repr == expected_repr.replace("[ns]", f"[{unit}]") + + def test_dti_representation(self, unit): + idxs = [] + idxs.append(DatetimeIndex([], freq="D")) + idxs.append(DatetimeIndex(["2011-01-01"], freq="D")) + idxs.append(DatetimeIndex(["2011-01-01", "2011-01-02"], freq="D")) + idxs.append(DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D")) + idxs.append( + DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], + freq="h", + tz="Asia/Tokyo", + ) + ) + idxs.append( + DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", NaT], tz="US/Eastern" + ) + ) + idxs.append( + DatetimeIndex(["2011-01-01 09:00", "2011-01-01 10:00", NaT], tz="UTC") + ) + + exp = [] + exp.append("DatetimeIndex([], dtype='datetime64[ns]', freq='D')") + exp.append("DatetimeIndex(['2011-01-01'], dtype='datetime64[ns]', freq='D')") + exp.append( + "DatetimeIndex(['2011-01-01', '2011-01-02'], " + "dtype='datetime64[ns]', freq='D')" + ) + exp.append( + "DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], " + "dtype='datetime64[ns]', freq='D')" + ) + exp.append( + "DatetimeIndex(['2011-01-01 09:00:00+09:00', " + "'2011-01-01 10:00:00+09:00', '2011-01-01 11:00:00+09:00']" + ", dtype='datetime64[ns, Asia/Tokyo]', freq='h')" + ) + exp.append( + "DatetimeIndex(['2011-01-01 09:00:00-05:00', " + "'2011-01-01 10:00:00-05:00', 'NaT'], " + "dtype='datetime64[ns, US/Eastern]', freq=None)" + ) + exp.append( + "DatetimeIndex(['2011-01-01 09:00:00+00:00', " + "'2011-01-01 10:00:00+00:00', 'NaT'], " + "dtype='datetime64[ns, UTC]', freq=None)" + "" + ) + + with pd.option_context("display.width", 300): + for index, expected in zip(idxs, exp): + index = index.as_unit(unit) + expected = expected.replace("[ns", f"[{unit}") + result = repr(index) + assert result == expected + result = str(index) + assert result == expected + + # TODO: this is a Series.__repr__ test + def test_dti_representation_to_series(self, unit): + idx1 = DatetimeIndex([], freq="D") + idx2 = DatetimeIndex(["2011-01-01"], freq="D") + idx3 = DatetimeIndex(["2011-01-01", "2011-01-02"], freq="D") + idx4 = DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") + idx5 = DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], + freq="h", + tz="Asia/Tokyo", + ) + idx6 = DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", NaT], tz="US/Eastern" + ) + idx7 = DatetimeIndex(["2011-01-01 09:00", "2011-01-02 10:15"]) + + exp1 = """Series([], dtype: datetime64[ns])""" + + exp2 = "0 2011-01-01\ndtype: datetime64[ns]" + + exp3 = "0 2011-01-01\n1 2011-01-02\ndtype: datetime64[ns]" + + exp4 = ( + "0 2011-01-01\n" + "1 2011-01-02\n" + "2 2011-01-03\n" + "dtype: datetime64[ns]" + ) + + exp5 = ( + "0 2011-01-01 09:00:00+09:00\n" + "1 2011-01-01 10:00:00+09:00\n" + "2 2011-01-01 11:00:00+09:00\n" + "dtype: datetime64[ns, Asia/Tokyo]" + ) + + exp6 = ( + "0 2011-01-01 09:00:00-05:00\n" + "1 2011-01-01 10:00:00-05:00\n" + "2 NaT\n" + "dtype: datetime64[ns, US/Eastern]" + ) + + exp7 = ( + "0 2011-01-01 09:00:00\n" + "1 2011-01-02 10:15:00\n" + "dtype: datetime64[ns]" + ) + + with pd.option_context("display.width", 300): + for idx, expected in zip( + [idx1, idx2, idx3, idx4, idx5, idx6, idx7], + [exp1, exp2, exp3, exp4, exp5, exp6, exp7], + ): + ser = Series(idx.as_unit(unit)) + result = repr(ser) + assert result == expected.replace("[ns", f"[{unit}") + + def test_dti_summary(self): + # GH#9116 + idx1 = DatetimeIndex([], freq="D") + idx2 = DatetimeIndex(["2011-01-01"], freq="D") + idx3 = DatetimeIndex(["2011-01-01", "2011-01-02"], freq="D") + idx4 = DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") + idx5 = DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], + freq="h", + tz="Asia/Tokyo", + ) + idx6 = DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", NaT], tz="US/Eastern" + ) + + exp1 = "DatetimeIndex: 0 entries\nFreq: D" + + exp2 = "DatetimeIndex: 1 entries, 2011-01-01 to 2011-01-01\nFreq: D" + + exp3 = "DatetimeIndex: 2 entries, 2011-01-01 to 2011-01-02\nFreq: D" + + exp4 = "DatetimeIndex: 3 entries, 2011-01-01 to 2011-01-03\nFreq: D" + + exp5 = ( + "DatetimeIndex: 3 entries, 2011-01-01 09:00:00+09:00 " + "to 2011-01-01 11:00:00+09:00\n" + "Freq: h" + ) + + exp6 = """DatetimeIndex: 3 entries, 2011-01-01 09:00:00-05:00 to NaT""" + + for idx, expected in zip( + [idx1, idx2, idx3, idx4, idx5, idx6], [exp1, exp2, exp3, exp4, exp5, exp6] + ): + result = idx._summary() + assert result == expected + + @pytest.mark.parametrize("tz", [None, pytz.utc, dateutil.tz.tzutc()]) + @pytest.mark.parametrize("freq", ["B", "C"]) + def test_dti_business_repr_etc_smoke(self, tz, freq): + # only really care that it works + dti = pd.bdate_range( + datetime(2009, 1, 1), datetime(2010, 1, 1), tz=tz, freq=freq + ) + repr(dti) + dti._summary() + dti[2:2]._summary() + + +class TestFormat: + def test_format(self): + # GH#35439 + idx = pd.date_range("20130101", periods=5) + expected = [f"{x:%Y-%m-%d}" for x in idx] + msg = r"DatetimeIndex\.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert idx.format() == expected + + def test_format_with_name_time_info(self): + # bug I fixed 12/20/2011 + dates = pd.date_range("2011-01-01 04:00:00", periods=10, name="something") + + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = dates.format(name=True) + assert formatted[0] == "something" + + def test_format_datetime_with_time(self): + dti = DatetimeIndex([datetime(2012, 2, 7), datetime(2012, 2, 7, 23)]) + + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = dti.format() + expected = ["2012-02-07 00:00:00", "2012-02-07 23:00:00"] + assert len(result) == 2 + assert result == expected + + def test_format_datetime(self): + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = pd.to_datetime([datetime(2003, 1, 1, 12), NaT]).format() + assert formatted[0] == "2003-01-01 12:00:00" + assert formatted[1] == "NaT" + + def test_format_date(self): + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = pd.to_datetime([datetime(2003, 1, 1), NaT]).format() + assert formatted[0] == "2003-01-01" + assert formatted[1] == "NaT" + + def test_format_date_tz(self): + dti = pd.to_datetime([datetime(2013, 1, 1)], utc=True) + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = dti.format() + assert formatted[0] == "2013-01-01 00:00:00+00:00" + + dti = pd.to_datetime([datetime(2013, 1, 1), NaT], utc=True) + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = dti.format() + assert formatted[0] == "2013-01-01 00:00:00+00:00" + + def test_format_date_explicit_date_format(self): + dti = pd.to_datetime([datetime(2003, 2, 1), NaT]) + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = dti.format(date_format="%m-%d-%Y", na_rep="UT") + assert formatted[0] == "02-01-2003" + assert formatted[1] == "UT" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_freq_attr.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_freq_attr.py new file mode 100644 index 0000000000000000000000000000000000000000..5cddf56cd1c73b3c00d8b59c6f99095ba9a704fb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_freq_attr.py @@ -0,0 +1,61 @@ +import pytest + +from pandas import ( + DatetimeIndex, + date_range, +) + +from pandas.tseries.offsets import ( + BDay, + DateOffset, + Day, + Hour, +) + + +class TestFreq: + def test_freq_setter_errors(self): + # GH#20678 + idx = DatetimeIndex(["20180101", "20180103", "20180105"]) + + # setting with an incompatible freq + msg = ( + "Inferred frequency 2D from passed values does not conform to " + "passed frequency 5D" + ) + with pytest.raises(ValueError, match=msg): + idx._data.freq = "5D" + + # setting with non-freq string + with pytest.raises(ValueError, match="Invalid frequency"): + idx._data.freq = "foo" + + @pytest.mark.parametrize("values", [["20180101", "20180103", "20180105"], []]) + @pytest.mark.parametrize("freq", ["2D", Day(2), "2B", BDay(2), "48h", Hour(48)]) + @pytest.mark.parametrize("tz", [None, "US/Eastern"]) + def test_freq_setter(self, values, freq, tz): + # GH#20678 + idx = DatetimeIndex(values, tz=tz) + + # can set to an offset, converting from string if necessary + idx._data.freq = freq + assert idx.freq == freq + assert isinstance(idx.freq, DateOffset) + + # can reset to None + idx._data.freq = None + assert idx.freq is None + + def test_freq_view_safe(self): + # Setting the freq for one DatetimeIndex shouldn't alter the freq + # for another that views the same data + + dti = date_range("2016-01-01", periods=5) + dta = dti._data + + dti2 = DatetimeIndex(dta)._with_freq(None) + assert dti2.freq is None + + # Original was not altered + assert dti.freq == "D" + assert dta.freq == "D" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_indexing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_indexing.py new file mode 100644 index 0000000000000000000000000000000000000000..bfbcdcff51ee6e7f50325962a44209a5c5bf9653 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_indexing.py @@ -0,0 +1,717 @@ +from datetime import ( + date, + datetime, + time, + timedelta, +) + +import numpy as np +import pytest + +from pandas._libs import index as libindex +from pandas.compat.numpy import np_long + +import pandas as pd +from pandas import ( + DatetimeIndex, + Index, + Timestamp, + bdate_range, + date_range, + notna, +) +import pandas._testing as tm + +from pandas.tseries.frequencies import to_offset + +START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) + + +class TestGetItem: + def test_getitem_slice_keeps_name(self): + # GH4226 + st = Timestamp("2013-07-01 00:00:00", tz="America/Los_Angeles") + et = Timestamp("2013-07-02 00:00:00", tz="America/Los_Angeles") + dr = date_range(st, et, freq="h", name="timebucket") + assert dr[1:].name == dr.name + + @pytest.mark.parametrize("tz", [None, "Asia/Tokyo"]) + def test_getitem(self, tz): + idx = date_range("2011-01-01", "2011-01-31", freq="D", tz=tz, name="idx") + + result = idx[0] + assert result == Timestamp("2011-01-01", tz=idx.tz) + + result = idx[0:5] + expected = date_range( + "2011-01-01", "2011-01-05", freq="D", tz=idx.tz, name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx[0:10:2] + expected = date_range( + "2011-01-01", "2011-01-09", freq="2D", tz=idx.tz, name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx[-20:-5:3] + expected = date_range( + "2011-01-12", "2011-01-24", freq="3D", tz=idx.tz, name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx[4::-1] + expected = DatetimeIndex( + ["2011-01-05", "2011-01-04", "2011-01-03", "2011-01-02", "2011-01-01"], + dtype=idx.dtype, + freq="-1D", + name="idx", + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + @pytest.mark.parametrize("freq", ["B", "C"]) + def test_dti_business_getitem(self, freq): + rng = bdate_range(START, END, freq=freq) + smaller = rng[:5] + exp = DatetimeIndex(rng.view(np.ndarray)[:5], freq=freq) + tm.assert_index_equal(smaller, exp) + assert smaller.freq == exp.freq + assert smaller.freq == rng.freq + + sliced = rng[::5] + assert sliced.freq == to_offset(freq) * 5 + + fancy_indexed = rng[[4, 3, 2, 1, 0]] + assert len(fancy_indexed) == 5 + assert isinstance(fancy_indexed, DatetimeIndex) + assert fancy_indexed.freq is None + + # 32-bit vs. 64-bit platforms + assert rng[4] == rng[np_long(4)] + + @pytest.mark.parametrize("freq", ["B", "C"]) + def test_dti_business_getitem_matplotlib_hackaround(self, freq): + rng = bdate_range(START, END, freq=freq) + with pytest.raises(ValueError, match="Multi-dimensional indexing"): + # GH#30588 multi-dimensional indexing deprecated + rng[:, None] + + def test_getitem_int_list(self): + dti = date_range(start="1/1/2005", end="12/1/2005", freq="ME") + dti2 = dti[[1, 3, 5]] + + v1 = dti2[0] + v2 = dti2[1] + v3 = dti2[2] + + assert v1 == Timestamp("2/28/2005") + assert v2 == Timestamp("4/30/2005") + assert v3 == Timestamp("6/30/2005") + + # getitem with non-slice drops freq + assert dti2.freq is None + + +class TestWhere: + def test_where_doesnt_retain_freq(self): + dti = date_range("20130101", periods=3, freq="D", name="idx") + cond = [True, True, False] + expected = DatetimeIndex([dti[0], dti[1], dti[0]], freq=None, name="idx") + + result = dti.where(cond, dti[::-1]) + tm.assert_index_equal(result, expected) + + def test_where_other(self): + # other is ndarray or Index + i = date_range("20130101", periods=3, tz="US/Eastern") + + for arr in [np.nan, pd.NaT]: + result = i.where(notna(i), other=arr) + expected = i + tm.assert_index_equal(result, expected) + + i2 = i.copy() + i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist()) + result = i.where(notna(i2), i2) + tm.assert_index_equal(result, i2) + + i2 = i.copy() + i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist()) + result = i.where(notna(i2), i2._values) + tm.assert_index_equal(result, i2) + + def test_where_invalid_dtypes(self): + dti = date_range("20130101", periods=3, tz="US/Eastern") + + tail = dti[2:].tolist() + i2 = Index([pd.NaT, pd.NaT] + tail) + + mask = notna(i2) + + # passing tz-naive ndarray to tzaware DTI + result = dti.where(mask, i2.values) + expected = Index([pd.NaT.asm8, pd.NaT.asm8] + tail, dtype=object) + tm.assert_index_equal(result, expected) + + # passing tz-aware DTI to tznaive DTI + naive = dti.tz_localize(None) + result = naive.where(mask, i2) + expected = Index([i2[0], i2[1]] + naive[2:].tolist(), dtype=object) + tm.assert_index_equal(result, expected) + + pi = i2.tz_localize(None).to_period("D") + result = dti.where(mask, pi) + expected = Index([pi[0], pi[1]] + tail, dtype=object) + tm.assert_index_equal(result, expected) + + tda = i2.asi8.view("timedelta64[ns]") + result = dti.where(mask, tda) + expected = Index([tda[0], tda[1]] + tail, dtype=object) + assert isinstance(expected[0], np.timedelta64) + tm.assert_index_equal(result, expected) + + result = dti.where(mask, i2.asi8) + expected = Index([pd.NaT._value, pd.NaT._value] + tail, dtype=object) + assert isinstance(expected[0], int) + tm.assert_index_equal(result, expected) + + # non-matching scalar + td = pd.Timedelta(days=4) + result = dti.where(mask, td) + expected = Index([td, td] + tail, dtype=object) + assert expected[0] is td + tm.assert_index_equal(result, expected) + + def test_where_mismatched_nat(self, tz_aware_fixture): + tz = tz_aware_fixture + dti = date_range("2013-01-01", periods=3, tz=tz) + cond = np.array([True, False, True]) + + tdnat = np.timedelta64("NaT", "ns") + expected = Index([dti[0], tdnat, dti[2]], dtype=object) + assert expected[1] is tdnat + + result = dti.where(cond, tdnat) + tm.assert_index_equal(result, expected) + + def test_where_tz(self): + i = date_range("20130101", periods=3, tz="US/Eastern") + result = i.where(notna(i)) + expected = i + tm.assert_index_equal(result, expected) + + i2 = i.copy() + i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist()) + result = i.where(notna(i2)) + expected = i2 + tm.assert_index_equal(result, expected) + + +class TestTake: + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_dti_take_dont_lose_meta(self, tzstr): + rng = date_range("1/1/2000", periods=20, tz=tzstr) + + result = rng.take(range(5)) + assert result.tz == rng.tz + assert result.freq == rng.freq + + def test_take_nan_first_datetime(self): + index = DatetimeIndex([pd.NaT, Timestamp("20130101"), Timestamp("20130102")]) + result = index.take([-1, 0, 1]) + expected = DatetimeIndex([index[-1], index[0], index[1]]) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "Asia/Tokyo"]) + def test_take(self, tz): + # GH#10295 + idx = date_range("2011-01-01", "2011-01-31", freq="D", name="idx", tz=tz) + + result = idx.take([0]) + assert result == Timestamp("2011-01-01", tz=idx.tz) + + result = idx.take([0, 1, 2]) + expected = date_range( + "2011-01-01", "2011-01-03", freq="D", tz=idx.tz, name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx.take([0, 2, 4]) + expected = date_range( + "2011-01-01", "2011-01-05", freq="2D", tz=idx.tz, name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx.take([7, 4, 1]) + expected = date_range( + "2011-01-08", "2011-01-02", freq="-3D", tz=idx.tz, name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx.take([3, 2, 5]) + expected = DatetimeIndex( + ["2011-01-04", "2011-01-03", "2011-01-06"], + dtype=idx.dtype, + freq=None, + name="idx", + ) + tm.assert_index_equal(result, expected) + assert result.freq is None + + result = idx.take([-3, 2, 5]) + expected = DatetimeIndex( + ["2011-01-29", "2011-01-03", "2011-01-06"], + dtype=idx.dtype, + freq=None, + name="idx", + ) + tm.assert_index_equal(result, expected) + assert result.freq is None + + def test_take_invalid_kwargs(self): + idx = date_range("2011-01-01", "2011-01-31", freq="D", name="idx") + indices = [1, 6, 5, 9, 10, 13, 15, 3] + + msg = r"take\(\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=msg): + idx.take(indices, foo=2) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + idx.take(indices, out=indices) + + msg = "the 'mode' parameter is not supported" + with pytest.raises(ValueError, match=msg): + idx.take(indices, mode="clip") + + # TODO: This method came from test_datetime; de-dup with version above + @pytest.mark.parametrize("tz", [None, "US/Eastern", "Asia/Tokyo"]) + def test_take2(self, tz): + dates = [ + datetime(2010, 1, 1, 14), + datetime(2010, 1, 1, 15), + datetime(2010, 1, 1, 17), + datetime(2010, 1, 1, 21), + ] + + idx = date_range( + start="2010-01-01 09:00", + end="2010-02-01 09:00", + freq="h", + tz=tz, + name="idx", + ) + expected = DatetimeIndex(dates, freq=None, name="idx", dtype=idx.dtype) + + taken1 = idx.take([5, 6, 8, 12]) + taken2 = idx[[5, 6, 8, 12]] + + for taken in [taken1, taken2]: + tm.assert_index_equal(taken, expected) + assert isinstance(taken, DatetimeIndex) + assert taken.freq is None + assert taken.tz == expected.tz + assert taken.name == expected.name + + def test_take_fill_value(self): + # GH#12631 + idx = DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"], name="xxx") + result = idx.take(np.array([1, 0, -1])) + expected = DatetimeIndex(["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx") + tm.assert_index_equal(result, expected) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + expected = DatetimeIndex(["2011-02-01", "2011-01-01", "NaT"], name="xxx") + tm.assert_index_equal(result, expected) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = DatetimeIndex(["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx") + tm.assert_index_equal(result, expected) + + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + msg = "out of bounds" + with pytest.raises(IndexError, match=msg): + idx.take(np.array([1, -5])) + + def test_take_fill_value_with_timezone(self): + idx = DatetimeIndex( + ["2011-01-01", "2011-02-01", "2011-03-01"], name="xxx", tz="US/Eastern" + ) + result = idx.take(np.array([1, 0, -1])) + expected = DatetimeIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx", tz="US/Eastern" + ) + tm.assert_index_equal(result, expected) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + expected = DatetimeIndex( + ["2011-02-01", "2011-01-01", "NaT"], name="xxx", tz="US/Eastern" + ) + tm.assert_index_equal(result, expected) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = DatetimeIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx", tz="US/Eastern" + ) + tm.assert_index_equal(result, expected) + + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + msg = "out of bounds" + with pytest.raises(IndexError, match=msg): + idx.take(np.array([1, -5])) + + +class TestGetLoc: + def test_get_loc_key_unit_mismatch(self): + idx = date_range("2000-01-01", periods=3) + key = idx[1].as_unit("ms") + loc = idx.get_loc(key) + assert loc == 1 + assert key in idx + + def test_get_loc_key_unit_mismatch_not_castable(self): + dta = date_range("2000-01-01", periods=3)._data.astype("M8[s]") + dti = DatetimeIndex(dta) + key = dta[0].as_unit("ns") + pd.Timedelta(1) + + with pytest.raises( + KeyError, match=r"Timestamp\('2000-01-01 00:00:00.000000001'\)" + ): + dti.get_loc(key) + + assert key not in dti + + def test_get_loc_time_obj(self): + # time indexing + idx = date_range("2000-01-01", periods=24, freq="h") + + result = idx.get_loc(time(12)) + expected = np.array([12]) + tm.assert_numpy_array_equal(result, expected, check_dtype=False) + + result = idx.get_loc(time(12, 30)) + expected = np.array([]) + tm.assert_numpy_array_equal(result, expected, check_dtype=False) + + @pytest.mark.parametrize("offset", [-10, 10]) + def test_get_loc_time_obj2(self, monkeypatch, offset): + # GH#8667 + size_cutoff = 50 + n = size_cutoff + offset + key = time(15, 11, 30) + start = key.hour * 3600 + key.minute * 60 + key.second + step = 24 * 3600 + + with monkeypatch.context(): + monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff) + idx = date_range("2014-11-26", periods=n, freq="s") + ts = pd.Series(np.random.default_rng(2).standard_normal(n), index=idx) + locs = np.arange(start, n, step, dtype=np.intp) + + result = ts.index.get_loc(key) + tm.assert_numpy_array_equal(result, locs) + tm.assert_series_equal(ts[key], ts.iloc[locs]) + + left, right = ts.copy(), ts.copy() + left[key] *= -10 + right.iloc[locs] *= -10 + tm.assert_series_equal(left, right) + + def test_get_loc_time_nat(self): + # GH#35114 + # Case where key's total microseconds happens to match iNaT % 1e6 // 1000 + tic = time(minute=12, second=43, microsecond=145224) + dti = DatetimeIndex([pd.NaT]) + + loc = dti.get_loc(tic) + expected = np.array([], dtype=np.intp) + tm.assert_numpy_array_equal(loc, expected) + + def test_get_loc_nat(self): + # GH#20464 + index = DatetimeIndex(["1/3/2000", "NaT"]) + assert index.get_loc(pd.NaT) == 1 + + assert index.get_loc(None) == 1 + + assert index.get_loc(np.nan) == 1 + + assert index.get_loc(pd.NA) == 1 + + assert index.get_loc(np.datetime64("NaT")) == 1 + + with pytest.raises(KeyError, match="NaT"): + index.get_loc(np.timedelta64("NaT")) + + @pytest.mark.parametrize("key", [pd.Timedelta(0), pd.Timedelta(1), timedelta(0)]) + def test_get_loc_timedelta_invalid_key(self, key): + # GH#20464 + dti = date_range("1970-01-01", periods=10) + msg = "Cannot index DatetimeIndex with [Tt]imedelta" + with pytest.raises(TypeError, match=msg): + dti.get_loc(key) + + def test_get_loc_reasonable_key_error(self): + # GH#1062 + index = DatetimeIndex(["1/3/2000"]) + with pytest.raises(KeyError, match="2000"): + index.get_loc("1/1/2000") + + def test_get_loc_year_str(self): + rng = date_range("1/1/2000", "1/1/2010") + + result = rng.get_loc("2009") + expected = slice(3288, 3653) + assert result == expected + + +class TestContains: + def test_dti_contains_with_duplicates(self): + d = datetime(2011, 12, 5, 20, 30) + ix = DatetimeIndex([d, d]) + assert d in ix + + @pytest.mark.parametrize( + "vals", + [ + [0, 1, 0], + [0, 0, -1], + [0, -1, -1], + ["2015", "2015", "2016"], + ["2015", "2015", "2014"], + ], + ) + def test_contains_nonunique(self, vals): + # GH#9512 + idx = DatetimeIndex(vals) + assert idx[0] in idx + + +class TestGetIndexer: + def test_get_indexer_date_objs(self): + rng = date_range("1/1/2000", periods=20) + + result = rng.get_indexer(rng.map(lambda x: x.date())) + expected = rng.get_indexer(rng) + tm.assert_numpy_array_equal(result, expected) + + def test_get_indexer(self): + idx = date_range("2000-01-01", periods=3) + exp = np.array([0, 1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(idx.get_indexer(idx), exp) + + target = idx[0] + pd.to_timedelta(["-1 hour", "12 hours", "1 day 1 hour"]) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "pad"), np.array([-1, 0, 1], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "backfill"), np.array([0, 1, 2], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest"), np.array([0, 1, 1], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest", tolerance=pd.Timedelta("1 hour")), + np.array([0, -1, 1], dtype=np.intp), + ) + tol_raw = [ + pd.Timedelta("1 hour"), + pd.Timedelta("1 hour"), + pd.Timedelta("1 hour").to_timedelta64(), + ] + tm.assert_numpy_array_equal( + idx.get_indexer( + target, "nearest", tolerance=[np.timedelta64(x) for x in tol_raw] + ), + np.array([0, -1, 1], dtype=np.intp), + ) + tol_bad = [ + pd.Timedelta("2 hour").to_timedelta64(), + pd.Timedelta("1 hour").to_timedelta64(), + "foo", + ] + msg = "Could not convert 'foo' to NumPy timedelta" + with pytest.raises(ValueError, match=msg): + idx.get_indexer(target, "nearest", tolerance=tol_bad) + with pytest.raises(ValueError, match="abbreviation w/o a number"): + idx.get_indexer(idx[[0]], method="nearest", tolerance="foo") + + @pytest.mark.parametrize( + "target", + [ + [date(2020, 1, 1), Timestamp("2020-01-02")], + [Timestamp("2020-01-01"), date(2020, 1, 2)], + ], + ) + def test_get_indexer_mixed_dtypes(self, target): + # https://github.com/pandas-dev/pandas/issues/33741 + values = DatetimeIndex([Timestamp("2020-01-01"), Timestamp("2020-01-02")]) + result = values.get_indexer(target) + expected = np.array([0, 1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "target, positions", + [ + ([date(9999, 1, 1), Timestamp("2020-01-01")], [-1, 0]), + ([Timestamp("2020-01-01"), date(9999, 1, 1)], [0, -1]), + ([date(9999, 1, 1), date(9999, 1, 1)], [-1, -1]), + ], + ) + def test_get_indexer_out_of_bounds_date(self, target, positions): + values = DatetimeIndex([Timestamp("2020-01-01"), Timestamp("2020-01-02")]) + + result = values.get_indexer(target) + expected = np.array(positions, dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + def test_get_indexer_pad_requires_monotonicity(self): + rng = date_range("1/1/2000", "3/1/2000", freq="B") + + # neither monotonic increasing or decreasing + rng2 = rng[[1, 0, 2]] + + msg = "index must be monotonic increasing or decreasing" + with pytest.raises(ValueError, match=msg): + rng2.get_indexer(rng, method="pad") + + +class TestMaybeCastSliceBound: + def test_maybe_cast_slice_bounds_empty(self): + # GH#14354 + empty_idx = date_range(freq="1h", periods=0, end="2015") + + right = empty_idx._maybe_cast_slice_bound("2015-01-02", "right") + exp = Timestamp("2015-01-02 23:59:59.999999999") + assert right == exp + + left = empty_idx._maybe_cast_slice_bound("2015-01-02", "left") + exp = Timestamp("2015-01-02 00:00:00") + assert left == exp + + def test_maybe_cast_slice_duplicate_monotonic(self): + # https://github.com/pandas-dev/pandas/issues/16515 + idx = DatetimeIndex(["2017", "2017"]) + result = idx._maybe_cast_slice_bound("2017-01-01", "left") + expected = Timestamp("2017-01-01") + assert result == expected + + +class TestGetSliceBounds: + @pytest.mark.parametrize("box", [date, datetime, Timestamp]) + @pytest.mark.parametrize("side, expected", [("left", 4), ("right", 5)]) + def test_get_slice_bounds_datetime_within( + self, box, side, expected, tz_aware_fixture + ): + # GH 35690 + tz = tz_aware_fixture + index = bdate_range("2000-01-03", "2000-02-11").tz_localize(tz) + key = box(year=2000, month=1, day=7) + + if tz is not None: + with pytest.raises(TypeError, match="Cannot compare tz-naive"): + # GH#36148 we require tzawareness-compat as of 2.0 + index.get_slice_bound(key, side=side) + else: + result = index.get_slice_bound(key, side=side) + assert result == expected + + @pytest.mark.parametrize("box", [datetime, Timestamp]) + @pytest.mark.parametrize("side", ["left", "right"]) + @pytest.mark.parametrize("year, expected", [(1999, 0), (2020, 30)]) + def test_get_slice_bounds_datetime_outside( + self, box, side, year, expected, tz_aware_fixture + ): + # GH 35690 + tz = tz_aware_fixture + index = bdate_range("2000-01-03", "2000-02-11").tz_localize(tz) + key = box(year=year, month=1, day=7) + + if tz is not None: + with pytest.raises(TypeError, match="Cannot compare tz-naive"): + # GH#36148 we require tzawareness-compat as of 2.0 + index.get_slice_bound(key, side=side) + else: + result = index.get_slice_bound(key, side=side) + assert result == expected + + @pytest.mark.parametrize("box", [datetime, Timestamp]) + def test_slice_datetime_locs(self, box, tz_aware_fixture): + # GH 34077 + tz = tz_aware_fixture + index = DatetimeIndex(["2010-01-01", "2010-01-03"]).tz_localize(tz) + key = box(2010, 1, 1) + + if tz is not None: + with pytest.raises(TypeError, match="Cannot compare tz-naive"): + # GH#36148 we require tzawareness-compat as of 2.0 + index.slice_locs(key, box(2010, 1, 2)) + else: + result = index.slice_locs(key, box(2010, 1, 2)) + expected = (0, 1) + assert result == expected + + +class TestIndexerBetweenTime: + def test_indexer_between_time(self): + # GH#11818 + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + msg = r"Cannot convert arg \[datetime\.datetime\(2010, 1, 2, 1, 0\)\] to a time" + with pytest.raises(ValueError, match=msg): + rng.indexer_between_time(datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) + + @pytest.mark.parametrize("unit", ["us", "ms", "s"]) + def test_indexer_between_time_non_nano(self, unit): + # For simple cases like this, the non-nano indexer_between_time + # should match the nano result + + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + arr_nano = rng._data._ndarray + + arr = arr_nano.astype(f"M8[{unit}]") + + dta = type(rng._data)._simple_new(arr, dtype=arr.dtype) + dti = DatetimeIndex(dta) + assert dti.dtype == arr.dtype + + tic = time(1, 25) + toc = time(2, 29) + + result = dti.indexer_between_time(tic, toc) + expected = rng.indexer_between_time(tic, toc) + tm.assert_numpy_array_equal(result, expected) + + # case with non-zero micros in arguments + tic = time(1, 25, 0, 45678) + toc = time(2, 29, 0, 1234) + + result = dti.indexer_between_time(tic, toc) + expected = rng.indexer_between_time(tic, toc) + tm.assert_numpy_array_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_iter.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_iter.py new file mode 100644 index 0000000000000000000000000000000000000000..a006ed79f27baed75bedb95e6f24e948e429172e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_iter.py @@ -0,0 +1,76 @@ +import dateutil.tz +import numpy as np +import pytest + +from pandas import ( + DatetimeIndex, + date_range, + to_datetime, +) +from pandas.core.arrays import datetimes + + +class TestDatetimeIndexIteration: + @pytest.mark.parametrize( + "tz", [None, "UTC", "US/Central", dateutil.tz.tzoffset(None, -28800)] + ) + def test_iteration_preserves_nanoseconds(self, tz): + # GH#19603 + index = DatetimeIndex( + ["2018-02-08 15:00:00.168456358", "2018-02-08 15:00:00.168456359"], tz=tz + ) + for i, ts in enumerate(index): + assert ts == index[i] # pylint: disable=unnecessary-list-index-lookup + + def test_iter_readonly(self): + # GH#28055 ints_to_pydatetime with readonly array + arr = np.array([np.datetime64("2012-02-15T12:00:00.000000000")]) + arr.setflags(write=False) + dti = to_datetime(arr) + list(dti) + + def test_iteration_preserves_tz(self): + # see GH#8890 + index = date_range("2012-01-01", periods=3, freq="h", tz="US/Eastern") + + for i, ts in enumerate(index): + result = ts + expected = index[i] # pylint: disable=unnecessary-list-index-lookup + assert result == expected + + def test_iteration_preserves_tz2(self): + index = date_range( + "2012-01-01", periods=3, freq="h", tz=dateutil.tz.tzoffset(None, -28800) + ) + + for i, ts in enumerate(index): + result = ts + expected = index[i] # pylint: disable=unnecessary-list-index-lookup + assert result._repr_base == expected._repr_base + assert result == expected + + def test_iteration_preserves_tz3(self): + # GH#9100 + index = DatetimeIndex( + ["2014-12-01 03:32:39.987000-08:00", "2014-12-01 04:12:34.987000-08:00"] + ) + for i, ts in enumerate(index): + result = ts + expected = index[i] # pylint: disable=unnecessary-list-index-lookup + assert result._repr_base == expected._repr_base + assert result == expected + + @pytest.mark.parametrize("offset", [-5, -1, 0, 1]) + def test_iteration_over_chunksize(self, offset, monkeypatch): + # GH#21012 + chunksize = 5 + index = date_range( + "2000-01-01 00:00:00", periods=chunksize - offset, freq="min" + ) + num = 0 + with monkeypatch.context() as m: + m.setattr(datetimes, "_ITER_CHUNKSIZE", chunksize) + for stamp in index: + assert index[num] == stamp + num += 1 + assert num == len(index) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_join.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_join.py new file mode 100644 index 0000000000000000000000000000000000000000..d0ac32939296c3f6f0fb0ecb501dde9d65ba989d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_join.py @@ -0,0 +1,149 @@ +from datetime import ( + datetime, + timezone, +) + +import numpy as np +import pytest + +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + Timestamp, + date_range, + period_range, + to_datetime, +) +import pandas._testing as tm + +from pandas.tseries.offsets import ( + BDay, + BMonthEnd, +) + + +class TestJoin: + def test_does_not_convert_mixed_integer(self): + df = DataFrame(np.ones((3, 2)), columns=date_range("2020-01-01", periods=2)) + cols = df.columns.join(df.index, how="outer") + joined = cols.join(df.columns) + assert cols.dtype == np.dtype("O") + assert cols.dtype == joined.dtype + tm.assert_numpy_array_equal(cols.values, joined.values) + + def test_join_self(self, join_type): + index = date_range("1/1/2000", periods=10) + joined = index.join(index, how=join_type) + assert index is joined + + def test_join_with_period_index(self, join_type): + df = DataFrame( + np.ones((10, 2)), + index=date_range("2020-01-01", periods=10), + columns=period_range("2020-01-01", periods=2), + ) + s = df.iloc[:5, 0] + + expected = df.columns.astype("O").join(s.index, how=join_type) + result = df.columns.join(s.index, how=join_type) + tm.assert_index_equal(expected, result) + + def test_join_object_index(self): + rng = date_range("1/1/2000", periods=10) + idx = Index(["a", "b", "c", "d"]) + + result = rng.join(idx, how="outer") + assert isinstance(result[0], Timestamp) + + def test_join_utc_convert(self, join_type): + rng = date_range("1/1/2011", periods=100, freq="h", tz="utc") + + left = rng.tz_convert("US/Eastern") + right = rng.tz_convert("Europe/Berlin") + + result = left.join(left[:-5], how=join_type) + assert isinstance(result, DatetimeIndex) + assert result.tz == left.tz + + result = left.join(right[:-5], how=join_type) + assert isinstance(result, DatetimeIndex) + assert result.tz is timezone.utc + + def test_datetimeindex_union_join_empty(self, sort): + dti = date_range(start="1/1/2001", end="2/1/2001", freq="D") + empty = Index([]) + + result = dti.union(empty, sort=sort) + expected = dti.astype("O") + tm.assert_index_equal(result, expected) + + result = dti.join(empty) + assert isinstance(result, DatetimeIndex) + tm.assert_index_equal(result, dti) + + def test_join_nonunique(self): + idx1 = to_datetime(["2012-11-06 16:00:11.477563", "2012-11-06 16:00:11.477563"]) + idx2 = to_datetime(["2012-11-06 15:11:09.006507", "2012-11-06 15:11:09.006507"]) + rs = idx1.join(idx2, how="outer") + assert rs.is_monotonic_increasing + + @pytest.mark.parametrize("freq", ["B", "C"]) + def test_outer_join(self, freq): + # should just behave as union + start, end = datetime(2009, 1, 1), datetime(2010, 1, 1) + rng = date_range(start=start, end=end, freq=freq) + + # overlapping + left = rng[:10] + right = rng[5:10] + + the_join = left.join(right, how="outer") + assert isinstance(the_join, DatetimeIndex) + + # non-overlapping, gap in middle + left = rng[:5] + right = rng[10:] + + the_join = left.join(right, how="outer") + assert isinstance(the_join, DatetimeIndex) + assert the_join.freq is None + + # non-overlapping, no gap + left = rng[:5] + right = rng[5:10] + + the_join = left.join(right, how="outer") + assert isinstance(the_join, DatetimeIndex) + + # overlapping, but different offset + other = date_range(start, end, freq=BMonthEnd()) + + the_join = rng.join(other, how="outer") + assert isinstance(the_join, DatetimeIndex) + assert the_join.freq is None + + def test_naive_aware_conflicts(self): + start, end = datetime(2009, 1, 1), datetime(2010, 1, 1) + naive = date_range(start, end, freq=BDay(), tz=None) + aware = date_range(start, end, freq=BDay(), tz="Asia/Hong_Kong") + + msg = "tz-naive.*tz-aware" + with pytest.raises(TypeError, match=msg): + naive.join(aware) + + with pytest.raises(TypeError, match=msg): + aware.join(naive) + + @pytest.mark.parametrize("tz", [None, "US/Pacific"]) + def test_join_preserves_freq(self, tz): + # GH#32157 + dti = date_range("2016-01-01", periods=10, tz=tz) + result = dti[:5].join(dti[5:], how="outer") + assert result.freq == dti.freq + tm.assert_index_equal(result, dti) + + result = dti[:5].join(dti[6:], how="outer") + assert result.freq is None + expected = dti.delete(5) + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_npfuncs.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_npfuncs.py new file mode 100644 index 0000000000000000000000000000000000000000..6c3e44c2a5db1ebc4f02686d19d34ae3caf1e9ad --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_npfuncs.py @@ -0,0 +1,13 @@ +import numpy as np + +from pandas import date_range +import pandas._testing as tm + + +class TestSplit: + def test_split_non_utc(self): + # GH#14042 + indices = date_range("2016-01-01 00:00:00+0200", freq="s", periods=10) + result = np.split(indices, indices_or_sections=[])[0] + expected = indices._with_freq(None) + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_ops.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..bac9548b932c163dc7a33282796c1bb682187664 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_ops.py @@ -0,0 +1,56 @@ +from datetime import datetime + +import pytest + +from pandas import ( + DatetimeIndex, + Index, + bdate_range, + date_range, +) +import pandas._testing as tm + + +class TestDatetimeIndexOps: + def test_infer_freq(self, freq_sample): + # GH 11018 + idx = date_range("2011-01-01 09:00:00", freq=freq_sample, periods=10) + result = DatetimeIndex(idx.asi8, freq="infer") + tm.assert_index_equal(idx, result) + assert result.freq == freq_sample + + +@pytest.mark.parametrize("freq", ["B", "C"]) +class TestBusinessDatetimeIndex: + @pytest.fixture + def rng(self, freq): + START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) + return bdate_range(START, END, freq=freq) + + def test_comparison(self, rng): + d = rng[10] + + comp = rng > d + assert comp[11] + assert not comp[9] + + def test_copy(self, rng): + cp = rng.copy() + tm.assert_index_equal(cp, rng) + + def test_identical(self, rng): + t1 = rng.copy() + t2 = rng.copy() + assert t1.identical(t2) + + # name + t1 = t1.rename("foo") + assert t1.equals(t2) + assert not t1.identical(t2) + t2 = t2.rename("foo") + assert t1.identical(t2) + + # freq + t2v = Index(t2.values) + assert t1.equals(t2v) + assert not t1.identical(t2v) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_partial_slicing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_partial_slicing.py new file mode 100644 index 0000000000000000000000000000000000000000..8b493fc61cb5873532e2e8393007533ee6cb8e4f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -0,0 +1,466 @@ +""" test partial slicing on Series/Frame """ + +from datetime import datetime + +import numpy as np +import pytest + +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + MultiIndex, + Series, + Timedelta, + Timestamp, + date_range, +) +import pandas._testing as tm + + +class TestSlicing: + def test_string_index_series_name_converted(self): + # GH#1644 + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + index=date_range("1/1/2000", periods=10), + ) + + result = df.loc["1/3/2000"] + assert result.name == df.index[2] + + result = df.T["1/3/2000"] + assert result.name == df.index[2] + + def test_stringified_slice_with_tz(self): + # GH#2658 + start = "2013-01-07" + idx = date_range(start=start, freq="1d", periods=10, tz="US/Eastern") + df = DataFrame(np.arange(10), index=idx) + df["2013-01-14 23:44:34.437768-05:00":] # no exception here + + def test_return_type_doesnt_depend_on_monotonicity(self): + # GH#24892 we get Series back regardless of whether our DTI is monotonic + dti = date_range(start="2015-5-13 23:59:00", freq="min", periods=3) + ser = Series(range(3), index=dti) + + # non-monotonic index + ser2 = Series(range(3), index=[dti[1], dti[0], dti[2]]) + + # key with resolution strictly lower than "min" + key = "2015-5-14 00" + + # monotonic increasing index + result = ser.loc[key] + expected = ser.iloc[1:] + tm.assert_series_equal(result, expected) + + # monotonic decreasing index + result = ser.iloc[::-1].loc[key] + expected = ser.iloc[::-1][:-1] + tm.assert_series_equal(result, expected) + + # non-monotonic index + result2 = ser2.loc[key] + expected2 = ser2.iloc[::2] + tm.assert_series_equal(result2, expected2) + + def test_return_type_doesnt_depend_on_monotonicity_higher_reso(self): + # GH#24892 we get Series back regardless of whether our DTI is monotonic + dti = date_range(start="2015-5-13 23:59:00", freq="min", periods=3) + ser = Series(range(3), index=dti) + + # non-monotonic index + ser2 = Series(range(3), index=[dti[1], dti[0], dti[2]]) + + # key with resolution strictly *higher) than "min" + key = "2015-5-14 00:00:00" + + # monotonic increasing index + result = ser.loc[key] + assert result == 1 + + # monotonic decreasing index + result = ser.iloc[::-1].loc[key] + assert result == 1 + + # non-monotonic index + result2 = ser2.loc[key] + assert result2 == 0 + + def test_monotone_DTI_indexing_bug(self): + # GH 19362 + # Testing accessing the first element in a monotonic descending + # partial string indexing. + + df = DataFrame(list(range(5))) + date_list = [ + "2018-01-02", + "2017-02-10", + "2016-03-10", + "2015-03-15", + "2014-03-16", + ] + date_index = DatetimeIndex(date_list) + df["date"] = date_index + expected = DataFrame({0: list(range(5)), "date": date_index}) + tm.assert_frame_equal(df, expected) + + # We get a slice because df.index's resolution is hourly and we + # are slicing with a daily-resolution string. If both were daily, + # we would get a single item back + dti = date_range("20170101 01:00:00", periods=3) + df = DataFrame({"A": [1, 2, 3]}, index=dti[::-1]) + + expected = DataFrame({"A": 1}, index=dti[-1:][::-1]) + result = df.loc["2017-01-03"] + tm.assert_frame_equal(result, expected) + + result2 = df.iloc[::-1].loc["2017-01-03"] + expected2 = expected.iloc[::-1] + tm.assert_frame_equal(result2, expected2) + + def test_slice_year(self): + dti = date_range(freq="B", start=datetime(2005, 1, 1), periods=500) + + s = Series(np.arange(len(dti)), index=dti) + result = s["2005"] + expected = s[s.index.year == 2005] + tm.assert_series_equal(result, expected) + + df = DataFrame(np.random.default_rng(2).random((len(dti), 5)), index=dti) + result = df.loc["2005"] + expected = df[df.index.year == 2005] + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "partial_dtime", + [ + "2019", + "2019Q4", + "Dec 2019", + "2019-12-31", + "2019-12-31 23", + "2019-12-31 23:59", + ], + ) + def test_slice_end_of_period_resolution(self, partial_dtime): + # GH#31064 + dti = date_range("2019-12-31 23:59:55.999999999", periods=10, freq="s") + + ser = Series(range(10), index=dti) + result = ser[partial_dtime] + expected = ser.iloc[:5] + tm.assert_series_equal(result, expected) + + def test_slice_quarter(self): + dti = date_range(freq="D", start=datetime(2000, 6, 1), periods=500) + + s = Series(np.arange(len(dti)), index=dti) + assert len(s["2001Q1"]) == 90 + + df = DataFrame(np.random.default_rng(2).random((len(dti), 5)), index=dti) + assert len(df.loc["1Q01"]) == 90 + + def test_slice_month(self): + dti = date_range(freq="D", start=datetime(2005, 1, 1), periods=500) + s = Series(np.arange(len(dti)), index=dti) + assert len(s["2005-11"]) == 30 + + df = DataFrame(np.random.default_rng(2).random((len(dti), 5)), index=dti) + assert len(df.loc["2005-11"]) == 30 + + tm.assert_series_equal(s["2005-11"], s["11-2005"]) + + def test_partial_slice(self): + rng = date_range(freq="D", start=datetime(2005, 1, 1), periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s["2005-05":"2006-02"] + expected = s["20050501":"20060228"] + tm.assert_series_equal(result, expected) + + result = s["2005-05":] + expected = s["20050501":] + tm.assert_series_equal(result, expected) + + result = s[:"2006-02"] + expected = s[:"20060228"] + tm.assert_series_equal(result, expected) + + result = s["2005-1-1"] + assert result == s.iloc[0] + + with pytest.raises(KeyError, match=r"^'2004-12-31'$"): + s["2004-12-31"] + + def test_partial_slice_daily(self): + rng = date_range(freq="h", start=datetime(2005, 1, 31), periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s["2005-1-31"] + tm.assert_series_equal(result, s.iloc[:24]) + + with pytest.raises(KeyError, match=r"^'2004-12-31 00'$"): + s["2004-12-31 00"] + + def test_partial_slice_hourly(self): + rng = date_range(freq="min", start=datetime(2005, 1, 1, 20, 0, 0), periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s["2005-1-1"] + tm.assert_series_equal(result, s.iloc[: 60 * 4]) + + result = s["2005-1-1 20"] + tm.assert_series_equal(result, s.iloc[:60]) + + assert s["2005-1-1 20:00"] == s.iloc[0] + with pytest.raises(KeyError, match=r"^'2004-12-31 00:15'$"): + s["2004-12-31 00:15"] + + def test_partial_slice_minutely(self): + rng = date_range(freq="s", start=datetime(2005, 1, 1, 23, 59, 0), periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s["2005-1-1 23:59"] + tm.assert_series_equal(result, s.iloc[:60]) + + result = s["2005-1-1"] + tm.assert_series_equal(result, s.iloc[:60]) + + assert s[Timestamp("2005-1-1 23:59:00")] == s.iloc[0] + with pytest.raises(KeyError, match=r"^'2004-12-31 00:00:00'$"): + s["2004-12-31 00:00:00"] + + def test_partial_slice_second_precision(self): + rng = date_range( + start=datetime(2005, 1, 1, 0, 0, 59, microsecond=999990), + periods=20, + freq="us", + ) + s = Series(np.arange(20), rng) + + tm.assert_series_equal(s["2005-1-1 00:00"], s.iloc[:10]) + tm.assert_series_equal(s["2005-1-1 00:00:59"], s.iloc[:10]) + + tm.assert_series_equal(s["2005-1-1 00:01"], s.iloc[10:]) + tm.assert_series_equal(s["2005-1-1 00:01:00"], s.iloc[10:]) + + assert s[Timestamp("2005-1-1 00:00:59.999990")] == s.iloc[0] + with pytest.raises(KeyError, match="2005-1-1 00:00:00"): + s["2005-1-1 00:00:00"] + + def test_partial_slicing_dataframe(self): + # GH14856 + # Test various combinations of string slicing resolution vs. + # index resolution + # - If string resolution is less precise than index resolution, + # string is considered a slice + # - If string resolution is equal to or more precise than index + # resolution, string is considered an exact match + formats = [ + "%Y", + "%Y-%m", + "%Y-%m-%d", + "%Y-%m-%d %H", + "%Y-%m-%d %H:%M", + "%Y-%m-%d %H:%M:%S", + ] + resolutions = ["year", "month", "day", "hour", "minute", "second"] + for rnum, resolution in enumerate(resolutions[2:], 2): + # we check only 'day', 'hour', 'minute' and 'second' + unit = Timedelta("1 " + resolution) + middate = datetime(2012, 1, 1, 0, 0, 0) + index = DatetimeIndex([middate - unit, middate, middate + unit]) + values = [1, 2, 3] + df = DataFrame({"a": values}, index, dtype=np.int64) + assert df.index.resolution == resolution + + # Timestamp with the same resolution as index + # Should be exact match for Series (return scalar) + # and raise KeyError for Frame + for timestamp, expected in zip(index, values): + ts_string = timestamp.strftime(formats[rnum]) + # make ts_string as precise as index + result = df["a"][ts_string] + assert isinstance(result, np.int64) + assert result == expected + msg = rf"^'{ts_string}'$" + with pytest.raises(KeyError, match=msg): + df[ts_string] + + # Timestamp with resolution less precise than index + for fmt in formats[:rnum]: + for element, theslice in [[0, slice(None, 1)], [1, slice(1, None)]]: + ts_string = index[element].strftime(fmt) + + # Series should return slice + result = df["a"][ts_string] + expected = df["a"][theslice] + tm.assert_series_equal(result, expected) + + # pre-2.0 df[ts_string] was overloaded to interpret this + # as slicing along index + with pytest.raises(KeyError, match=ts_string): + df[ts_string] + + # Timestamp with resolution more precise than index + # Compatible with existing key + # Should return scalar for Series + # and raise KeyError for Frame + for fmt in formats[rnum + 1 :]: + ts_string = index[1].strftime(fmt) + result = df["a"][ts_string] + assert isinstance(result, np.int64) + assert result == 2 + msg = rf"^'{ts_string}'$" + with pytest.raises(KeyError, match=msg): + df[ts_string] + + # Not compatible with existing key + # Should raise KeyError + for fmt, res in list(zip(formats, resolutions))[rnum + 1 :]: + ts = index[1] + Timedelta("1 " + res) + ts_string = ts.strftime(fmt) + msg = rf"^'{ts_string}'$" + with pytest.raises(KeyError, match=msg): + df["a"][ts_string] + with pytest.raises(KeyError, match=msg): + df[ts_string] + + def test_partial_slicing_with_multiindex(self): + # GH 4758 + # partial string indexing with a multi-index buggy + df = DataFrame( + { + "ACCOUNT": ["ACCT1", "ACCT1", "ACCT1", "ACCT2"], + "TICKER": ["ABC", "MNP", "XYZ", "XYZ"], + "val": [1, 2, 3, 4], + }, + index=date_range("2013-06-19 09:30:00", periods=4, freq="5min"), + ) + df_multi = df.set_index(["ACCOUNT", "TICKER"], append=True) + + expected = DataFrame( + [[1]], index=Index(["ABC"], name="TICKER"), columns=["val"] + ) + result = df_multi.loc[("2013-06-19 09:30:00", "ACCT1")] + tm.assert_frame_equal(result, expected) + + expected = df_multi.loc[ + (Timestamp("2013-06-19 09:30:00", tz=None), "ACCT1", "ABC") + ] + result = df_multi.loc[("2013-06-19 09:30:00", "ACCT1", "ABC")] + tm.assert_series_equal(result, expected) + + # partial string indexing on first level, scalar indexing on the other two + result = df_multi.loc[("2013-06-19", "ACCT1", "ABC")] + expected = df_multi.iloc[:1].droplevel([1, 2]) + tm.assert_frame_equal(result, expected) + + def test_partial_slicing_with_multiindex_series(self): + # GH 4294 + # partial slice on a series mi + ser = Series( + range(250), + index=MultiIndex.from_product( + [date_range("2000-1-1", periods=50), range(5)] + ), + ) + + s2 = ser[:-1].copy() + expected = s2["2000-1-4"] + result = s2[Timestamp("2000-1-4")] + tm.assert_series_equal(result, expected) + + result = ser[Timestamp("2000-1-4")] + expected = ser["2000-1-4"] + tm.assert_series_equal(result, expected) + + df2 = DataFrame(ser) + expected = df2.xs("2000-1-4") + result = df2.loc[Timestamp("2000-1-4")] + tm.assert_frame_equal(result, expected) + + def test_partial_slice_requires_monotonicity(self): + # Disallowed since 2.0 (GH 37819) + ser = Series(np.arange(10), date_range("2014-01-01", periods=10)) + + nonmonotonic = ser.iloc[[3, 5, 4]] + timestamp = Timestamp("2014-01-10") + with pytest.raises( + KeyError, match="Value based partial slicing on non-monotonic" + ): + nonmonotonic["2014-01-10":] + + with pytest.raises(KeyError, match=r"Timestamp\('2014-01-10 00:00:00'\)"): + nonmonotonic[timestamp:] + + with pytest.raises( + KeyError, match="Value based partial slicing on non-monotonic" + ): + nonmonotonic.loc["2014-01-10":] + + with pytest.raises(KeyError, match=r"Timestamp\('2014-01-10 00:00:00'\)"): + nonmonotonic.loc[timestamp:] + + def test_loc_datetime_length_one(self): + # GH16071 + df = DataFrame( + columns=["1"], + index=date_range("2016-10-01T00:00:00", "2016-10-01T23:59:59"), + ) + result = df.loc[datetime(2016, 10, 1) :] + tm.assert_frame_equal(result, df) + + result = df.loc["2016-10-01T00:00:00":] + tm.assert_frame_equal(result, df) + + @pytest.mark.parametrize( + "start", + [ + "2018-12-02 21:50:00+00:00", + Timestamp("2018-12-02 21:50:00+00:00"), + Timestamp("2018-12-02 21:50:00+00:00").to_pydatetime(), + ], + ) + @pytest.mark.parametrize( + "end", + [ + "2018-12-02 21:52:00+00:00", + Timestamp("2018-12-02 21:52:00+00:00"), + Timestamp("2018-12-02 21:52:00+00:00").to_pydatetime(), + ], + ) + def test_getitem_with_datestring_with_UTC_offset(self, start, end): + # GH 24076 + idx = date_range( + start="2018-12-02 14:50:00-07:00", + end="2018-12-02 14:50:00-07:00", + freq="1min", + ) + df = DataFrame(1, index=idx, columns=["A"]) + result = df[start:end] + expected = df.iloc[0:3, :] + tm.assert_frame_equal(result, expected) + + # GH 16785 + start = str(start) + end = str(end) + with pytest.raises(ValueError, match="Both dates must"): + df[start : end[:-4] + "1:00"] + + with pytest.raises(ValueError, match="The index must be timezone"): + df = df.tz_localize(None) + df[start:end] + + def test_slice_reduce_to_series(self): + # GH 27516 + df = DataFrame( + {"A": range(24)}, index=date_range("2000", periods=24, freq="ME") + ) + expected = Series( + range(12), index=date_range("2000", periods=12, freq="ME"), name="A" + ) + result = df.loc["2000", "A"] + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_pickle.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_pickle.py new file mode 100644 index 0000000000000000000000000000000000000000..922b4a18119f4d457de501225611f8884689d434 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_pickle.py @@ -0,0 +1,45 @@ +import pytest + +from pandas import ( + NaT, + date_range, + to_datetime, +) +import pandas._testing as tm + + +class TestPickle: + def test_pickle(self): + # GH#4606 + idx = to_datetime(["2013-01-01", NaT, "2014-01-06"]) + idx_p = tm.round_trip_pickle(idx) + assert idx_p[0] == idx[0] + assert idx_p[1] is NaT + assert idx_p[2] == idx[2] + + def test_pickle_dont_infer_freq(self): + # GH#11002 + # don't infer freq + idx = date_range("1750-1-1", "2050-1-1", freq="7D") + idx_p = tm.round_trip_pickle(idx) + tm.assert_index_equal(idx, idx_p) + + def test_pickle_after_set_freq(self): + dti = date_range("20130101", periods=3, tz="US/Eastern", name="foo") + dti = dti._with_freq(None) + + res = tm.round_trip_pickle(dti) + tm.assert_index_equal(res, dti) + + def test_roundtrip_pickle_with_tz(self): + # GH#8367 + # round-trip of timezone + index = date_range("20130101", periods=3, tz="US/Eastern", name="foo") + unpickled = tm.round_trip_pickle(index) + tm.assert_index_equal(index, unpickled) + + @pytest.mark.parametrize("freq", ["B", "C"]) + def test_pickle_unpickle(self, freq): + rng = date_range("2009-01-01", "2010-01-01", freq=freq) + unpickled = tm.round_trip_pickle(rng) + assert unpickled.freq == freq diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_reindex.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_reindex.py new file mode 100644 index 0000000000000000000000000000000000000000..e4911aa3c4a2938cedb70887b6bd3f28e408f8c5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_reindex.py @@ -0,0 +1,56 @@ +from datetime import timedelta + +import numpy as np + +from pandas import ( + DatetimeIndex, + date_range, +) +import pandas._testing as tm + + +class TestDatetimeIndexReindex: + def test_reindex_preserves_tz_if_target_is_empty_list_or_array(self): + # GH#7774 + index = date_range("2013-01-01", periods=3, tz="US/Eastern") + assert str(index.reindex([])[0].tz) == "US/Eastern" + assert str(index.reindex(np.array([]))[0].tz) == "US/Eastern" + + def test_reindex_with_same_tz_nearest(self): + # GH#32740 + rng_a = date_range("2010-01-01", "2010-01-02", periods=24, tz="utc") + rng_b = date_range("2010-01-01", "2010-01-02", periods=23, tz="utc") + result1, result2 = rng_a.reindex( + rng_b, method="nearest", tolerance=timedelta(seconds=20) + ) + expected_list1 = [ + "2010-01-01 00:00:00", + "2010-01-01 01:05:27.272727272", + "2010-01-01 02:10:54.545454545", + "2010-01-01 03:16:21.818181818", + "2010-01-01 04:21:49.090909090", + "2010-01-01 05:27:16.363636363", + "2010-01-01 06:32:43.636363636", + "2010-01-01 07:38:10.909090909", + "2010-01-01 08:43:38.181818181", + "2010-01-01 09:49:05.454545454", + "2010-01-01 10:54:32.727272727", + "2010-01-01 12:00:00", + "2010-01-01 13:05:27.272727272", + "2010-01-01 14:10:54.545454545", + "2010-01-01 15:16:21.818181818", + "2010-01-01 16:21:49.090909090", + "2010-01-01 17:27:16.363636363", + "2010-01-01 18:32:43.636363636", + "2010-01-01 19:38:10.909090909", + "2010-01-01 20:43:38.181818181", + "2010-01-01 21:49:05.454545454", + "2010-01-01 22:54:32.727272727", + "2010-01-02 00:00:00", + ] + expected1 = DatetimeIndex( + expected_list1, dtype="datetime64[ns, UTC]", freq=None + ) + expected2 = np.array([0] + [-1] * 21 + [23], dtype=np.dtype("intp")) + tm.assert_index_equal(result1, expected1) + tm.assert_numpy_array_equal(result2, expected2) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_scalar_compat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_scalar_compat.py new file mode 100644 index 0000000000000000000000000000000000000000..e93fc0e2a4e2e740e2dee27e332b68b060ba7aa7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -0,0 +1,329 @@ +""" +Tests for DatetimeIndex methods behaving like their Timestamp counterparts +""" + +import calendar +from datetime import ( + date, + datetime, + time, +) +import locale +import unicodedata + +import numpy as np +import pytest + +from pandas._libs.tslibs import timezones + +from pandas import ( + DatetimeIndex, + Index, + NaT, + Timestamp, + date_range, + offsets, +) +import pandas._testing as tm +from pandas.core.arrays import DatetimeArray + + +class TestDatetimeIndexOps: + def test_dti_no_millisecond_field(self): + msg = "type object 'DatetimeIndex' has no attribute 'millisecond'" + with pytest.raises(AttributeError, match=msg): + DatetimeIndex.millisecond + + msg = "'DatetimeIndex' object has no attribute 'millisecond'" + with pytest.raises(AttributeError, match=msg): + DatetimeIndex([]).millisecond + + def test_dti_time(self): + rng = date_range("1/1/2000", freq="12min", periods=10) + result = Index(rng).time + expected = [t.time() for t in rng] + assert (result == expected).all() + + def test_dti_date(self): + rng = date_range("1/1/2000", freq="12h", periods=10) + result = Index(rng).date + expected = [t.date() for t in rng] + assert (result == expected).all() + + @pytest.mark.parametrize( + "dtype", + [None, "datetime64[ns, CET]", "datetime64[ns, EST]", "datetime64[ns, UTC]"], + ) + def test_dti_date2(self, dtype): + # Regression test for GH#21230 + expected = np.array([date(2018, 6, 4), NaT]) + + index = DatetimeIndex(["2018-06-04 10:00:00", NaT], dtype=dtype) + result = index.date + + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", + [None, "datetime64[ns, CET]", "datetime64[ns, EST]", "datetime64[ns, UTC]"], + ) + def test_dti_time2(self, dtype): + # Regression test for GH#21267 + expected = np.array([time(10, 20, 30), NaT]) + + index = DatetimeIndex(["2018-06-04 10:20:30", NaT], dtype=dtype) + result = index.time + + tm.assert_numpy_array_equal(result, expected) + + def test_dti_timetz(self, tz_naive_fixture): + # GH#21358 + tz = timezones.maybe_get_tz(tz_naive_fixture) + + expected = np.array([time(10, 20, 30, tzinfo=tz), NaT]) + + index = DatetimeIndex(["2018-06-04 10:20:30", NaT], tz=tz) + result = index.timetz + + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "field", + [ + "dayofweek", + "day_of_week", + "dayofyear", + "day_of_year", + "quarter", + "days_in_month", + "is_month_start", + "is_month_end", + "is_quarter_start", + "is_quarter_end", + "is_year_start", + "is_year_end", + ], + ) + def test_dti_timestamp_fields(self, field): + # extra fields from DatetimeIndex like quarter and week + idx = date_range("2020-01-01", periods=10) + expected = getattr(idx, field)[-1] + + result = getattr(Timestamp(idx[-1]), field) + assert result == expected + + def test_dti_nanosecond(self): + dti = DatetimeIndex(np.arange(10)) + expected = Index(np.arange(10, dtype=np.int32)) + + tm.assert_index_equal(dti.nanosecond, expected) + + @pytest.mark.parametrize("prefix", ["", "dateutil/"]) + def test_dti_hour_tzaware(self, prefix): + strdates = ["1/1/2012", "3/1/2012", "4/1/2012"] + rng = DatetimeIndex(strdates, tz=prefix + "US/Eastern") + assert (rng.hour == 0).all() + + # a more unusual time zone, GH#1946 + dr = date_range( + "2011-10-02 00:00", freq="h", periods=10, tz=prefix + "America/Atikokan" + ) + + expected = Index(np.arange(10, dtype=np.int32)) + tm.assert_index_equal(dr.hour, expected) + + # GH#12806 + # error: Unsupported operand types for + ("List[None]" and "List[str]") + @pytest.mark.parametrize( + "time_locale", [None] + tm.get_locales() # type: ignore[operator] + ) + def test_day_name_month_name(self, time_locale): + # Test Monday -> Sunday and January -> December, in that sequence + if time_locale is None: + # If the time_locale is None, day-name and month_name should + # return the english attributes + expected_days = [ + "Monday", + "Tuesday", + "Wednesday", + "Thursday", + "Friday", + "Saturday", + "Sunday", + ] + expected_months = [ + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December", + ] + else: + with tm.set_locale(time_locale, locale.LC_TIME): + expected_days = calendar.day_name[:] + expected_months = calendar.month_name[1:] + + # GH#11128 + dti = date_range(freq="D", start=datetime(1998, 1, 1), periods=365) + english_days = [ + "Monday", + "Tuesday", + "Wednesday", + "Thursday", + "Friday", + "Saturday", + "Sunday", + ] + for day, name, eng_name in zip(range(4, 11), expected_days, english_days): + name = name.capitalize() + assert dti.day_name(locale=time_locale)[day] == name + assert dti.day_name(locale=None)[day] == eng_name + ts = Timestamp(datetime(2016, 4, day)) + assert ts.day_name(locale=time_locale) == name + dti = dti.append(DatetimeIndex([NaT])) + assert np.isnan(dti.day_name(locale=time_locale)[-1]) + ts = Timestamp(NaT) + assert np.isnan(ts.day_name(locale=time_locale)) + + # GH#12805 + dti = date_range(freq="ME", start="2012", end="2013") + result = dti.month_name(locale=time_locale) + expected = Index([month.capitalize() for month in expected_months]) + + # work around different normalization schemes GH#22342 + result = result.str.normalize("NFD") + expected = expected.str.normalize("NFD") + + tm.assert_index_equal(result, expected) + + for item, expected in zip(dti, expected_months): + result = item.month_name(locale=time_locale) + expected = expected.capitalize() + + result = unicodedata.normalize("NFD", result) + expected = unicodedata.normalize("NFD", result) + + assert result == expected + dti = dti.append(DatetimeIndex([NaT])) + assert np.isnan(dti.month_name(locale=time_locale)[-1]) + + def test_dti_week(self): + # GH#6538: Check that DatetimeIndex and its TimeStamp elements + # return the same weekofyear accessor close to new year w/ tz + dates = ["2013/12/29", "2013/12/30", "2013/12/31"] + dates = DatetimeIndex(dates, tz="Europe/Brussels") + expected = [52, 1, 1] + assert dates.isocalendar().week.tolist() == expected + assert [d.weekofyear for d in dates] == expected + + @pytest.mark.parametrize("tz", [None, "US/Eastern"]) + def test_dti_fields(self, tz): + # GH#13303 + dti = date_range(freq="D", start=datetime(1998, 1, 1), periods=365, tz=tz) + assert dti.year[0] == 1998 + assert dti.month[0] == 1 + assert dti.day[0] == 1 + assert dti.hour[0] == 0 + assert dti.minute[0] == 0 + assert dti.second[0] == 0 + assert dti.microsecond[0] == 0 + assert dti.dayofweek[0] == 3 + + assert dti.dayofyear[0] == 1 + assert dti.dayofyear[120] == 121 + + assert dti.isocalendar().week.iloc[0] == 1 + assert dti.isocalendar().week.iloc[120] == 18 + + assert dti.quarter[0] == 1 + assert dti.quarter[120] == 2 + + assert dti.days_in_month[0] == 31 + assert dti.days_in_month[90] == 30 + + assert dti.is_month_start[0] + assert not dti.is_month_start[1] + assert dti.is_month_start[31] + assert dti.is_quarter_start[0] + assert dti.is_quarter_start[90] + assert dti.is_year_start[0] + assert not dti.is_year_start[364] + assert not dti.is_month_end[0] + assert dti.is_month_end[30] + assert not dti.is_month_end[31] + assert dti.is_month_end[364] + assert not dti.is_quarter_end[0] + assert not dti.is_quarter_end[30] + assert dti.is_quarter_end[89] + assert dti.is_quarter_end[364] + assert not dti.is_year_end[0] + assert dti.is_year_end[364] + + assert len(dti.year) == 365 + assert len(dti.month) == 365 + assert len(dti.day) == 365 + assert len(dti.hour) == 365 + assert len(dti.minute) == 365 + assert len(dti.second) == 365 + assert len(dti.microsecond) == 365 + assert len(dti.dayofweek) == 365 + assert len(dti.dayofyear) == 365 + assert len(dti.isocalendar()) == 365 + assert len(dti.quarter) == 365 + assert len(dti.is_month_start) == 365 + assert len(dti.is_month_end) == 365 + assert len(dti.is_quarter_start) == 365 + assert len(dti.is_quarter_end) == 365 + assert len(dti.is_year_start) == 365 + assert len(dti.is_year_end) == 365 + + dti.name = "name" + + # non boolean accessors -> return Index + for accessor in DatetimeArray._field_ops: + res = getattr(dti, accessor) + assert len(res) == 365 + assert isinstance(res, Index) + assert res.name == "name" + + # boolean accessors -> return array + for accessor in DatetimeArray._bool_ops: + res = getattr(dti, accessor) + assert len(res) == 365 + assert isinstance(res, np.ndarray) + + # test boolean indexing + res = dti[dti.is_quarter_start] + exp = dti[[0, 90, 181, 273]] + tm.assert_index_equal(res, exp) + res = dti[dti.is_leap_year] + exp = DatetimeIndex([], freq="D", tz=dti.tz, name="name").as_unit("ns") + tm.assert_index_equal(res, exp) + + def test_dti_is_year_quarter_start(self): + dti = date_range(freq="BQE-FEB", start=datetime(1998, 1, 1), periods=4) + + assert sum(dti.is_quarter_start) == 0 + assert sum(dti.is_quarter_end) == 4 + assert sum(dti.is_year_start) == 0 + assert sum(dti.is_year_end) == 1 + + def test_dti_is_month_start(self): + dti = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-03"]) + + assert dti.is_month_start[0] == 1 + + def test_dti_is_month_start_custom(self): + # Ensure is_start/end accessors throw ValueError for CustomBusinessDay, + bday_egypt = offsets.CustomBusinessDay(weekmask="Sun Mon Tue Wed Thu") + dti = date_range(datetime(2013, 4, 30), periods=5, freq=bday_egypt) + msg = "Custom business days is not supported by is_month_start" + with pytest.raises(ValueError, match=msg): + dti.is_month_start diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_setops.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_setops.py new file mode 100644 index 0000000000000000000000000000000000000000..fc3a1d4721841a052c19071883653a48c835c3b2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_setops.py @@ -0,0 +1,666 @@ +from datetime import ( + datetime, + timezone, +) + +import numpy as np +import pytest +import pytz + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + Series, + Timestamp, + bdate_range, + date_range, +) +import pandas._testing as tm + +from pandas.tseries.offsets import ( + BMonthEnd, + Minute, + MonthEnd, +) + +START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) + + +class TestDatetimeIndexSetOps: + tz = [ + None, + "UTC", + "Asia/Tokyo", + "US/Eastern", + "dateutil/Asia/Singapore", + "dateutil/US/Pacific", + ] + + # TODO: moved from test_datetimelike; dedup with version below + def test_union2(self, sort): + everything = date_range("2020-01-01", periods=10) + first = everything[:5] + second = everything[5:] + union = first.union(second, sort=sort) + tm.assert_index_equal(union, everything) + + @pytest.mark.parametrize("box", [np.array, Series, list]) + def test_union3(self, sort, box): + everything = date_range("2020-01-01", periods=10) + first = everything[:5] + second = everything[5:] + + # GH 10149 support listlike inputs other than Index objects + expected = first.union(second, sort=sort) + case = box(second.values) + result = first.union(case, sort=sort) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("tz", tz) + def test_union(self, tz, sort): + rng1 = date_range("1/1/2000", freq="D", periods=5, tz=tz) + other1 = date_range("1/6/2000", freq="D", periods=5, tz=tz) + expected1 = date_range("1/1/2000", freq="D", periods=10, tz=tz) + expected1_notsorted = DatetimeIndex(list(other1) + list(rng1)) + + rng2 = date_range("1/1/2000", freq="D", periods=5, tz=tz) + other2 = date_range("1/4/2000", freq="D", periods=5, tz=tz) + expected2 = date_range("1/1/2000", freq="D", periods=8, tz=tz) + expected2_notsorted = DatetimeIndex(list(other2) + list(rng2[:3])) + + rng3 = date_range("1/1/2000", freq="D", periods=5, tz=tz) + other3 = DatetimeIndex([], tz=tz).as_unit("ns") + expected3 = date_range("1/1/2000", freq="D", periods=5, tz=tz) + expected3_notsorted = rng3 + + for rng, other, exp, exp_notsorted in [ + (rng1, other1, expected1, expected1_notsorted), + (rng2, other2, expected2, expected2_notsorted), + (rng3, other3, expected3, expected3_notsorted), + ]: + result_union = rng.union(other, sort=sort) + tm.assert_index_equal(result_union, exp) + + result_union = other.union(rng, sort=sort) + if sort is None: + tm.assert_index_equal(result_union, exp) + else: + tm.assert_index_equal(result_union, exp_notsorted) + + def test_union_coverage(self, sort): + idx = DatetimeIndex(["2000-01-03", "2000-01-01", "2000-01-02"]) + ordered = DatetimeIndex(idx.sort_values(), freq="infer") + result = ordered.union(idx, sort=sort) + tm.assert_index_equal(result, ordered) + + result = ordered[:0].union(ordered, sort=sort) + tm.assert_index_equal(result, ordered) + assert result.freq == ordered.freq + + def test_union_bug_1730(self, sort): + rng_a = date_range("1/1/2012", periods=4, freq="3h") + rng_b = date_range("1/1/2012", periods=4, freq="4h") + + result = rng_a.union(rng_b, sort=sort) + exp = list(rng_a) + list(rng_b[1:]) + if sort is None: + exp = DatetimeIndex(sorted(exp)) + else: + exp = DatetimeIndex(exp) + tm.assert_index_equal(result, exp) + + def test_union_bug_1745(self, sort): + left = DatetimeIndex(["2012-05-11 15:19:49.695000"]) + right = DatetimeIndex( + [ + "2012-05-29 13:04:21.322000", + "2012-05-11 15:27:24.873000", + "2012-05-11 15:31:05.350000", + ] + ) + + result = left.union(right, sort=sort) + exp = DatetimeIndex( + [ + "2012-05-11 15:19:49.695000", + "2012-05-29 13:04:21.322000", + "2012-05-11 15:27:24.873000", + "2012-05-11 15:31:05.350000", + ] + ) + if sort is None: + exp = exp.sort_values() + tm.assert_index_equal(result, exp) + + def test_union_bug_4564(self, sort): + from pandas import DateOffset + + left = date_range("2013-01-01", "2013-02-01") + right = left + DateOffset(minutes=15) + + result = left.union(right, sort=sort) + exp = list(left) + list(right) + if sort is None: + exp = DatetimeIndex(sorted(exp)) + else: + exp = DatetimeIndex(exp) + tm.assert_index_equal(result, exp) + + def test_union_freq_both_none(self, sort): + # GH11086 + expected = bdate_range("20150101", periods=10) + expected._data.freq = None + + result = expected.union(expected, sort=sort) + tm.assert_index_equal(result, expected) + assert result.freq is None + + def test_union_freq_infer(self): + # When taking the union of two DatetimeIndexes, we infer + # a freq even if the arguments don't have freq. This matches + # TimedeltaIndex behavior. + dti = date_range("2016-01-01", periods=5) + left = dti[[0, 1, 3, 4]] + right = dti[[2, 3, 1]] + + assert left.freq is None + assert right.freq is None + + result = left.union(right) + tm.assert_index_equal(result, dti) + assert result.freq == "D" + + def test_union_dataframe_index(self): + rng1 = date_range("1/1/1999", "1/1/2012", freq="MS") + s1 = Series(np.random.default_rng(2).standard_normal(len(rng1)), rng1) + + rng2 = date_range("1/1/1980", "12/1/2001", freq="MS") + s2 = Series(np.random.default_rng(2).standard_normal(len(rng2)), rng2) + df = DataFrame({"s1": s1, "s2": s2}) + + exp = date_range("1/1/1980", "1/1/2012", freq="MS") + tm.assert_index_equal(df.index, exp) + + def test_union_with_DatetimeIndex(self, sort): + i1 = Index(np.arange(0, 20, 2, dtype=np.int64)) + i2 = date_range(start="2012-01-03 00:00:00", periods=10, freq="D") + # Works + i1.union(i2, sort=sort) + # Fails with "AttributeError: can't set attribute" + i2.union(i1, sort=sort) + + def test_union_same_timezone_different_units(self): + # GH 55238 + idx1 = date_range("2000-01-01", periods=3, tz="UTC").as_unit("ms") + idx2 = date_range("2000-01-01", periods=3, tz="UTC").as_unit("us") + result = idx1.union(idx2) + expected = date_range("2000-01-01", periods=3, tz="UTC").as_unit("us") + tm.assert_index_equal(result, expected) + + # TODO: moved from test_datetimelike; de-duplicate with version below + def test_intersection2(self): + first = date_range("2020-01-01", periods=10) + second = first[5:] + intersect = first.intersection(second) + tm.assert_index_equal(intersect, second) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + result = first.intersection(case) + tm.assert_index_equal(result, second) + + third = Index(["a", "b", "c"]) + result = first.intersection(third) + expected = Index([], dtype=object) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "tz", [None, "Asia/Tokyo", "US/Eastern", "dateutil/US/Pacific"] + ) + def test_intersection(self, tz, sort): + # GH 4690 (with tz) + base = date_range("6/1/2000", "6/30/2000", freq="D", name="idx") + + # if target has the same name, it is preserved + rng2 = date_range("5/15/2000", "6/20/2000", freq="D", name="idx") + expected2 = date_range("6/1/2000", "6/20/2000", freq="D", name="idx") + + # if target name is different, it will be reset + rng3 = date_range("5/15/2000", "6/20/2000", freq="D", name="other") + expected3 = date_range("6/1/2000", "6/20/2000", freq="D", name=None) + + rng4 = date_range("7/1/2000", "7/31/2000", freq="D", name="idx") + expected4 = DatetimeIndex([], freq="D", name="idx", dtype="M8[ns]") + + for rng, expected in [ + (rng2, expected2), + (rng3, expected3), + (rng4, expected4), + ]: + result = base.intersection(rng) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + # non-monotonic + base = DatetimeIndex( + ["2011-01-05", "2011-01-04", "2011-01-02", "2011-01-03"], tz=tz, name="idx" + ).as_unit("ns") + + rng2 = DatetimeIndex( + ["2011-01-04", "2011-01-02", "2011-02-02", "2011-02-03"], tz=tz, name="idx" + ).as_unit("ns") + expected2 = DatetimeIndex( + ["2011-01-04", "2011-01-02"], tz=tz, name="idx" + ).as_unit("ns") + + rng3 = DatetimeIndex( + ["2011-01-04", "2011-01-02", "2011-02-02", "2011-02-03"], + tz=tz, + name="other", + ).as_unit("ns") + expected3 = DatetimeIndex( + ["2011-01-04", "2011-01-02"], tz=tz, name=None + ).as_unit("ns") + + # GH 7880 + rng4 = date_range("7/1/2000", "7/31/2000", freq="D", tz=tz, name="idx") + expected4 = DatetimeIndex([], tz=tz, name="idx").as_unit("ns") + assert expected4.freq is None + + for rng, expected in [ + (rng2, expected2), + (rng3, expected3), + (rng4, expected4), + ]: + result = base.intersection(rng, sort=sort) + if sort is None: + expected = expected.sort_values() + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + # parametrize over both anchored and non-anchored freqs, as they + # have different code paths + @pytest.mark.parametrize("freq", ["min", "B"]) + def test_intersection_empty(self, tz_aware_fixture, freq): + # empty same freq GH2129 + tz = tz_aware_fixture + rng = date_range("6/1/2000", "6/15/2000", freq=freq, tz=tz) + result = rng[0:0].intersection(rng) + assert len(result) == 0 + assert result.freq == rng.freq + + result = rng.intersection(rng[0:0]) + assert len(result) == 0 + assert result.freq == rng.freq + + # no overlap GH#33604 + check_freq = freq != "min" # We don't preserve freq on non-anchored offsets + result = rng[:3].intersection(rng[-3:]) + tm.assert_index_equal(result, rng[:0]) + if check_freq: + # We don't preserve freq on non-anchored offsets + assert result.freq == rng.freq + + # swapped left and right + result = rng[-3:].intersection(rng[:3]) + tm.assert_index_equal(result, rng[:0]) + if check_freq: + # We don't preserve freq on non-anchored offsets + assert result.freq == rng.freq + + def test_intersection_bug_1708(self): + from pandas import DateOffset + + index_1 = date_range("1/1/2012", periods=4, freq="12h") + index_2 = index_1 + DateOffset(hours=1) + + result = index_1.intersection(index_2) + assert len(result) == 0 + + @pytest.mark.parametrize("tz", tz) + def test_difference(self, tz, sort): + rng_dates = ["1/2/2000", "1/3/2000", "1/1/2000", "1/4/2000", "1/5/2000"] + + rng1 = DatetimeIndex(rng_dates, tz=tz) + other1 = date_range("1/6/2000", freq="D", periods=5, tz=tz) + expected1 = DatetimeIndex(rng_dates, tz=tz) + + rng2 = DatetimeIndex(rng_dates, tz=tz) + other2 = date_range("1/4/2000", freq="D", periods=5, tz=tz) + expected2 = DatetimeIndex(rng_dates[:3], tz=tz) + + rng3 = DatetimeIndex(rng_dates, tz=tz) + other3 = DatetimeIndex([], tz=tz) + expected3 = DatetimeIndex(rng_dates, tz=tz) + + for rng, other, expected in [ + (rng1, other1, expected1), + (rng2, other2, expected2), + (rng3, other3, expected3), + ]: + result_diff = rng.difference(other, sort) + if sort is None and len(other): + # We dont sort (yet?) when empty GH#24959 + expected = expected.sort_values() + tm.assert_index_equal(result_diff, expected) + + def test_difference_freq(self, sort): + # GH14323: difference of DatetimeIndex should not preserve frequency + + index = date_range("20160920", "20160925", freq="D") + other = date_range("20160921", "20160924", freq="D") + expected = DatetimeIndex(["20160920", "20160925"], dtype="M8[ns]", freq=None) + idx_diff = index.difference(other, sort) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) + + # preserve frequency when the difference is a contiguous + # subset of the original range + other = date_range("20160922", "20160925", freq="D") + idx_diff = index.difference(other, sort) + expected = DatetimeIndex(["20160920", "20160921"], dtype="M8[ns]", freq="D") + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) + + def test_datetimeindex_diff(self, sort): + dti1 = date_range(freq="QE-JAN", start=datetime(1997, 12, 31), periods=100) + dti2 = date_range(freq="QE-JAN", start=datetime(1997, 12, 31), periods=98) + assert len(dti1.difference(dti2, sort)) == 2 + + @pytest.mark.parametrize("tz", [None, "Asia/Tokyo", "US/Eastern"]) + def test_setops_preserve_freq(self, tz): + rng = date_range("1/1/2000", "1/1/2002", name="idx", tz=tz) + + result = rng[:50].union(rng[50:100]) + assert result.name == rng.name + assert result.freq == rng.freq + assert result.tz == rng.tz + + result = rng[:50].union(rng[30:100]) + assert result.name == rng.name + assert result.freq == rng.freq + assert result.tz == rng.tz + + result = rng[:50].union(rng[60:100]) + assert result.name == rng.name + assert result.freq is None + assert result.tz == rng.tz + + result = rng[:50].intersection(rng[25:75]) + assert result.name == rng.name + assert result.freqstr == "D" + assert result.tz == rng.tz + + nofreq = DatetimeIndex(list(rng[25:75]), name="other") + result = rng[:50].union(nofreq) + assert result.name is None + assert result.freq == rng.freq + assert result.tz == rng.tz + + result = rng[:50].intersection(nofreq) + assert result.name is None + assert result.freq == rng.freq + assert result.tz == rng.tz + + def test_intersection_non_tick_no_fastpath(self): + # GH#42104 + dti = DatetimeIndex( + [ + "2018-12-31", + "2019-03-31", + "2019-06-30", + "2019-09-30", + "2019-12-31", + "2020-03-31", + ], + freq="QE-DEC", + ) + result = dti[::2].intersection(dti[1::2]) + expected = dti[:0] + tm.assert_index_equal(result, expected) + + def test_dti_intersection(self): + rng = date_range("1/1/2011", periods=100, freq="h", tz="utc") + + left = rng[10:90][::-1] + right = rng[20:80][::-1] + + assert left.tz == rng.tz + result = left.intersection(right) + assert result.tz == left.tz + + # Note: not difference, as there is no symmetry requirement there + @pytest.mark.parametrize("setop", ["union", "intersection", "symmetric_difference"]) + def test_dti_setop_aware(self, setop): + # non-overlapping + # GH#39328 as of 2.0 we cast these to UTC instead of object + rng = date_range("2012-11-15 00:00:00", periods=6, freq="h", tz="US/Central") + + rng2 = date_range("2012-11-15 12:00:00", periods=6, freq="h", tz="US/Eastern") + + result = getattr(rng, setop)(rng2) + + left = rng.tz_convert("UTC") + right = rng2.tz_convert("UTC") + expected = getattr(left, setop)(right) + tm.assert_index_equal(result, expected) + assert result.tz == left.tz + if len(result): + assert result[0].tz is timezone.utc + assert result[-1].tz is timezone.utc + + def test_dti_union_mixed(self): + # GH#21671 + rng = DatetimeIndex([Timestamp("2011-01-01"), pd.NaT]) + rng2 = DatetimeIndex(["2012-01-01", "2012-01-02"], tz="Asia/Tokyo") + result = rng.union(rng2) + expected = Index( + [ + Timestamp("2011-01-01"), + pd.NaT, + Timestamp("2012-01-01", tz="Asia/Tokyo"), + Timestamp("2012-01-02", tz="Asia/Tokyo"), + ], + dtype=object, + ) + tm.assert_index_equal(result, expected) + + +class TestBusinessDatetimeIndex: + def test_union(self, sort): + rng = bdate_range(START, END) + # overlapping + left = rng[:10] + right = rng[5:10] + + the_union = left.union(right, sort=sort) + assert isinstance(the_union, DatetimeIndex) + + # non-overlapping, gap in middle + left = rng[:5] + right = rng[10:] + + the_union = left.union(right, sort=sort) + assert isinstance(the_union, Index) + + # non-overlapping, no gap + left = rng[:5] + right = rng[5:10] + + the_union = left.union(right, sort=sort) + assert isinstance(the_union, DatetimeIndex) + + # order does not matter + if sort is None: + tm.assert_index_equal(right.union(left, sort=sort), the_union) + else: + expected = DatetimeIndex(list(right) + list(left)) + tm.assert_index_equal(right.union(left, sort=sort), expected) + + # overlapping, but different offset + rng = date_range(START, END, freq=BMonthEnd()) + + the_union = rng.union(rng, sort=sort) + assert isinstance(the_union, DatetimeIndex) + + def test_union_not_cacheable(self, sort): + rng = date_range("1/1/2000", periods=50, freq=Minute()) + rng1 = rng[10:] + rng2 = rng[:25] + the_union = rng1.union(rng2, sort=sort) + if sort is None: + tm.assert_index_equal(the_union, rng) + else: + expected = DatetimeIndex(list(rng[10:]) + list(rng[:10])) + tm.assert_index_equal(the_union, expected) + + rng1 = rng[10:] + rng2 = rng[15:35] + the_union = rng1.union(rng2, sort=sort) + expected = rng[10:] + tm.assert_index_equal(the_union, expected) + + def test_intersection(self): + rng = date_range("1/1/2000", periods=50, freq=Minute()) + rng1 = rng[10:] + rng2 = rng[:25] + the_int = rng1.intersection(rng2) + expected = rng[10:25] + tm.assert_index_equal(the_int, expected) + assert isinstance(the_int, DatetimeIndex) + assert the_int.freq == rng.freq + + the_int = rng1.intersection(rng2) + tm.assert_index_equal(the_int, expected) + + # non-overlapping + the_int = rng[:10].intersection(rng[10:]) + expected = DatetimeIndex([]).as_unit("ns") + tm.assert_index_equal(the_int, expected) + + def test_intersection_bug(self): + # GH #771 + a = bdate_range("11/30/2011", "12/31/2011") + b = bdate_range("12/10/2011", "12/20/2011") + result = a.intersection(b) + tm.assert_index_equal(result, b) + assert result.freq == b.freq + + def test_intersection_list(self): + # GH#35876 + # values is not an Index -> no name -> retain "a" + values = [Timestamp("2020-01-01"), Timestamp("2020-02-01")] + idx = DatetimeIndex(values, name="a") + res = idx.intersection(values) + tm.assert_index_equal(res, idx) + + def test_month_range_union_tz_pytz(self, sort): + tz = pytz.timezone("US/Eastern") + + early_start = datetime(2011, 1, 1) + early_end = datetime(2011, 3, 1) + + late_start = datetime(2011, 3, 1) + late_end = datetime(2011, 5, 1) + + early_dr = date_range(start=early_start, end=early_end, tz=tz, freq=MonthEnd()) + late_dr = date_range(start=late_start, end=late_end, tz=tz, freq=MonthEnd()) + + early_dr.union(late_dr, sort=sort) + + @td.skip_if_windows + def test_month_range_union_tz_dateutil(self, sort): + from pandas._libs.tslibs.timezones import dateutil_gettz + + tz = dateutil_gettz("US/Eastern") + + early_start = datetime(2011, 1, 1) + early_end = datetime(2011, 3, 1) + + late_start = datetime(2011, 3, 1) + late_end = datetime(2011, 5, 1) + + early_dr = date_range(start=early_start, end=early_end, tz=tz, freq=MonthEnd()) + late_dr = date_range(start=late_start, end=late_end, tz=tz, freq=MonthEnd()) + + early_dr.union(late_dr, sort=sort) + + @pytest.mark.parametrize("sort", [False, None]) + def test_intersection_duplicates(self, sort): + # GH#38196 + idx1 = Index( + [ + Timestamp("2019-12-13"), + Timestamp("2019-12-12"), + Timestamp("2019-12-12"), + ] + ) + result = idx1.intersection(idx1, sort=sort) + expected = Index([Timestamp("2019-12-13"), Timestamp("2019-12-12")]) + tm.assert_index_equal(result, expected) + + +class TestCustomDatetimeIndex: + def test_union(self, sort): + # overlapping + rng = bdate_range(START, END, freq="C") + left = rng[:10] + right = rng[5:10] + + the_union = left.union(right, sort=sort) + assert isinstance(the_union, DatetimeIndex) + + # non-overlapping, gap in middle + left = rng[:5] + right = rng[10:] + + the_union = left.union(right, sort) + assert isinstance(the_union, Index) + + # non-overlapping, no gap + left = rng[:5] + right = rng[5:10] + + the_union = left.union(right, sort=sort) + assert isinstance(the_union, DatetimeIndex) + + # order does not matter + if sort is None: + tm.assert_index_equal(right.union(left, sort=sort), the_union) + + # overlapping, but different offset + rng = date_range(START, END, freq=BMonthEnd()) + + the_union = rng.union(rng, sort=sort) + assert isinstance(the_union, DatetimeIndex) + + def test_intersection_bug(self): + # GH #771 + a = bdate_range("11/30/2011", "12/31/2011", freq="C") + b = bdate_range("12/10/2011", "12/20/2011", freq="C") + result = a.intersection(b) + tm.assert_index_equal(result, b) + assert result.freq == b.freq + + @pytest.mark.parametrize( + "tz", [None, "UTC", "Europe/Berlin", pytz.FixedOffset(-60)] + ) + def test_intersection_dst_transition(self, tz): + # GH 46702: Europe/Berlin has DST transition + idx1 = date_range("2020-03-27", periods=5, freq="D", tz=tz) + idx2 = date_range("2020-03-30", periods=5, freq="D", tz=tz) + result = idx1.intersection(idx2) + expected = date_range("2020-03-30", periods=2, freq="D", tz=tz) + tm.assert_index_equal(result, expected) + + # GH#45863 same problem for union + index1 = date_range("2021-10-28", periods=3, freq="D", tz="Europe/London") + index2 = date_range("2021-10-30", periods=4, freq="D", tz="Europe/London") + result = index1.union(index2) + expected = date_range("2021-10-28", periods=6, freq="D", tz="Europe/London") + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_timezones.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_timezones.py new file mode 100644 index 0000000000000000000000000000000000000000..daa5b346eb4ec2034fb164be5c03f12b7d0b4dc6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/datetimes/test_timezones.py @@ -0,0 +1,251 @@ +""" +Tests for DatetimeIndex timezone-related methods +""" +from datetime import ( + datetime, + timedelta, + timezone, + tzinfo, +) + +from dateutil.tz import gettz +import numpy as np +import pytest +import pytz + +from pandas._libs.tslibs import ( + conversion, + timezones, +) + +import pandas as pd +from pandas import ( + DatetimeIndex, + Timestamp, + bdate_range, + date_range, + isna, + to_datetime, +) +import pandas._testing as tm + + +class FixedOffset(tzinfo): + """Fixed offset in minutes east from UTC.""" + + def __init__(self, offset, name) -> None: + self.__offset = timedelta(minutes=offset) + self.__name = name + + def utcoffset(self, dt): + return self.__offset + + def tzname(self, dt): + return self.__name + + def dst(self, dt): + return timedelta(0) + + +fixed_off_no_name = FixedOffset(-330, None) + + +class TestDatetimeIndexTimezones: + # ------------------------------------------------------------- + # Unsorted + + def test_dti_drop_dont_lose_tz(self): + # GH#2621 + ind = date_range("2012-12-01", periods=10, tz="utc") + ind = ind.drop(ind[-1]) + + assert ind.tz is not None + + def test_dti_tz_conversion_freq(self, tz_naive_fixture): + # GH25241 + t3 = DatetimeIndex(["2019-01-01 10:00"], freq="h") + assert t3.tz_localize(tz=tz_naive_fixture).freq == t3.freq + t4 = DatetimeIndex(["2019-01-02 12:00"], tz="UTC", freq="min") + assert t4.tz_convert(tz="UTC").freq == t4.freq + + def test_drop_dst_boundary(self): + # see gh-18031 + tz = "Europe/Brussels" + freq = "15min" + + start = Timestamp("201710290100", tz=tz) + end = Timestamp("201710290300", tz=tz) + index = date_range(start=start, end=end, freq=freq) + + expected = DatetimeIndex( + [ + "201710290115", + "201710290130", + "201710290145", + "201710290200", + "201710290215", + "201710290230", + "201710290245", + "201710290200", + "201710290215", + "201710290230", + "201710290245", + "201710290300", + ], + dtype="M8[ns, Europe/Brussels]", + freq=freq, + ambiguous=[ + True, + True, + True, + True, + True, + True, + True, + False, + False, + False, + False, + False, + ], + ) + result = index.drop(index[0]) + tm.assert_index_equal(result, expected) + + def test_date_range_localize(self, unit): + rng = date_range( + "3/11/2012 03:00", periods=15, freq="h", tz="US/Eastern", unit=unit + ) + rng2 = DatetimeIndex( + ["3/11/2012 03:00", "3/11/2012 04:00"], dtype=f"M8[{unit}, US/Eastern]" + ) + rng3 = date_range("3/11/2012 03:00", periods=15, freq="h", unit=unit) + rng3 = rng3.tz_localize("US/Eastern") + + tm.assert_index_equal(rng._with_freq(None), rng3) + + # DST transition time + val = rng[0] + exp = Timestamp("3/11/2012 03:00", tz="US/Eastern") + + assert val.hour == 3 + assert exp.hour == 3 + assert val == exp # same UTC value + tm.assert_index_equal(rng[:2], rng2) + + def test_date_range_localize2(self, unit): + # Right before the DST transition + rng = date_range( + "3/11/2012 00:00", periods=2, freq="h", tz="US/Eastern", unit=unit + ) + rng2 = DatetimeIndex( + ["3/11/2012 00:00", "3/11/2012 01:00"], + dtype=f"M8[{unit}, US/Eastern]", + freq="h", + ) + tm.assert_index_equal(rng, rng2) + exp = Timestamp("3/11/2012 00:00", tz="US/Eastern") + assert exp.hour == 0 + assert rng[0] == exp + exp = Timestamp("3/11/2012 01:00", tz="US/Eastern") + assert exp.hour == 1 + assert rng[1] == exp + + rng = date_range( + "3/11/2012 00:00", periods=10, freq="h", tz="US/Eastern", unit=unit + ) + assert rng[2].hour == 3 + + def test_timestamp_equality_different_timezones(self): + utc_range = date_range("1/1/2000", periods=20, tz="UTC") + eastern_range = utc_range.tz_convert("US/Eastern") + berlin_range = utc_range.tz_convert("Europe/Berlin") + + for a, b, c in zip(utc_range, eastern_range, berlin_range): + assert a == b + assert b == c + assert a == c + + assert (utc_range == eastern_range).all() + assert (utc_range == berlin_range).all() + assert (berlin_range == eastern_range).all() + + def test_dti_equals_with_tz(self): + left = date_range("1/1/2011", periods=100, freq="h", tz="utc") + right = date_range("1/1/2011", periods=100, freq="h", tz="US/Eastern") + + assert not left.equals(right) + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_dti_tz_nat(self, tzstr): + idx = DatetimeIndex([Timestamp("2013-1-1", tz=tzstr), pd.NaT]) + + assert isna(idx[1]) + assert idx[0].tzinfo is not None + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_utc_box_timestamp_and_localize(self, tzstr): + tz = timezones.maybe_get_tz(tzstr) + + rng = date_range("3/11/2012", "3/12/2012", freq="h", tz="utc") + rng_eastern = rng.tz_convert(tzstr) + + expected = rng[-1].astimezone(tz) + + stamp = rng_eastern[-1] + assert stamp == expected + assert stamp.tzinfo == expected.tzinfo + + # right tzinfo + rng = date_range("3/13/2012", "3/14/2012", freq="h", tz="utc") + rng_eastern = rng.tz_convert(tzstr) + # test not valid for dateutil timezones. + # assert 'EDT' in repr(rng_eastern[0].tzinfo) + assert "EDT" in repr(rng_eastern[0].tzinfo) or "tzfile" in repr( + rng_eastern[0].tzinfo + ) + + @pytest.mark.parametrize("tz", [pytz.timezone("US/Central"), gettz("US/Central")]) + def test_with_tz(self, tz): + # just want it to work + start = datetime(2011, 3, 12, tzinfo=pytz.utc) + dr = bdate_range(start, periods=50, freq=pd.offsets.Hour()) + assert dr.tz is pytz.utc + + # DateRange with naive datetimes + dr = bdate_range("1/1/2005", "1/1/2009", tz=pytz.utc) + dr = bdate_range("1/1/2005", "1/1/2009", tz=tz) + + # normalized + central = dr.tz_convert(tz) + assert central.tz is tz + naive = central[0].to_pydatetime().replace(tzinfo=None) + comp = conversion.localize_pydatetime(naive, tz).tzinfo + assert central[0].tz is comp + + # compare vs a localized tz + naive = dr[0].to_pydatetime().replace(tzinfo=None) + comp = conversion.localize_pydatetime(naive, tz).tzinfo + assert central[0].tz is comp + + # datetimes with tzinfo set + dr = bdate_range( + datetime(2005, 1, 1, tzinfo=pytz.utc), datetime(2009, 1, 1, tzinfo=pytz.utc) + ) + msg = "Start and end cannot both be tz-aware with different timezones" + with pytest.raises(Exception, match=msg): + bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), "1/1/2009", tz=tz) + + @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) + def test_dti_convert_tz_aware_datetime_datetime(self, tz): + # GH#1581 + dates = [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)] + + dates_aware = [conversion.localize_pydatetime(x, tz) for x in dates] + result = DatetimeIndex(dates_aware).as_unit("ns") + assert timezones.tz_compare(result.tz, tz) + + converted = to_datetime(dates_aware, utc=True).as_unit("ns") + ex_vals = np.array([Timestamp(x).as_unit("ns")._value for x in dates_aware]) + tm.assert_numpy_array_equal(converted.asi8, ex_vals) + assert converted.tz is timezone.utc diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..978949e8b140a6f614b1d8163a0fbaf37e6a2486 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_astype.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_astype.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d494ae53a5add42e5f022d2dfa41554060dbc1d7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_astype.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_constructors.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_constructors.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc4aafbb05ba933ce24e2652e158636fda5dd2a7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_constructors.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_equals.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_equals.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53985ffd3f5cc8d2db99e2648b9610d55a20afc0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_equals.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_formats.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_formats.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a65693b5faef17bdc140ecfb2d8ea18cf9ee2aa0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_formats.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_indexing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_indexing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..082a7b6eded2c7f4891866bd549f95421514d6c3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_indexing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_interval.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_interval.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ac94cfb598cadfc9e31a85e83f0ab89aa3eabfc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_interval.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_interval_range.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_interval_range.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..77ecb9e0d398d12c33d748ce20296097d71bdd76 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_interval_range.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_interval_tree.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_interval_tree.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..654e60024788993827db32cdc6ed6f5e6d8cbee8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_interval_tree.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_join.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_join.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..065a014794d22db262bbdd08afa32354606d3d47 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_join.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_pickle.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_pickle.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2326d70ac48597a07511f051a945817d2425069c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_pickle.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_setops.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_setops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d1c74c96e1996294c352cc18443cd0b4964a4d6c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/__pycache__/test_setops.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/test_astype.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/test_astype.py new file mode 100644 index 0000000000000000000000000000000000000000..59c555b9644a1230dc60d622fb5fdb80a1743afe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/test_astype.py @@ -0,0 +1,248 @@ +import re + +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + IntervalDtype, +) + +from pandas import ( + CategoricalIndex, + Index, + IntervalIndex, + NaT, + Timedelta, + Timestamp, + interval_range, +) +import pandas._testing as tm + + +class AstypeTests: + """Tests common to IntervalIndex with any subtype""" + + def test_astype_idempotent(self, index): + result = index.astype("interval") + tm.assert_index_equal(result, index) + + result = index.astype(index.dtype) + tm.assert_index_equal(result, index) + + def test_astype_object(self, index): + result = index.astype(object) + expected = Index(index.values, dtype="object") + tm.assert_index_equal(result, expected) + assert not result.equals(index) + + def test_astype_category(self, index): + result = index.astype("category") + expected = CategoricalIndex(index.values) + tm.assert_index_equal(result, expected) + + result = index.astype(CategoricalDtype()) + tm.assert_index_equal(result, expected) + + # non-default params + categories = index.dropna().unique().values[:-1] + dtype = CategoricalDtype(categories=categories, ordered=True) + result = index.astype(dtype) + expected = CategoricalIndex(index.values, categories=categories, ordered=True) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", + [ + "int64", + "uint64", + "float64", + "complex128", + "period[M]", + "timedelta64", + "timedelta64[ns]", + "datetime64", + "datetime64[ns]", + "datetime64[ns, US/Eastern]", + ], + ) + def test_astype_cannot_cast(self, index, dtype): + msg = "Cannot cast IntervalIndex to dtype" + with pytest.raises(TypeError, match=msg): + index.astype(dtype) + + def test_astype_invalid_dtype(self, index): + msg = "data type [\"']fake_dtype[\"'] not understood" + with pytest.raises(TypeError, match=msg): + index.astype("fake_dtype") + + +class TestIntSubtype(AstypeTests): + """Tests specific to IntervalIndex with integer-like subtype""" + + indexes = [ + IntervalIndex.from_breaks(np.arange(-10, 11, dtype="int64")), + IntervalIndex.from_breaks(np.arange(100, dtype="uint64"), closed="left"), + ] + + @pytest.fixture(params=indexes) + def index(self, request): + return request.param + + @pytest.mark.parametrize( + "subtype", ["float64", "datetime64[ns]", "timedelta64[ns]"] + ) + def test_subtype_conversion(self, index, subtype): + dtype = IntervalDtype(subtype, index.closed) + result = index.astype(dtype) + expected = IntervalIndex.from_arrays( + index.left.astype(subtype), index.right.astype(subtype), closed=index.closed + ) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "subtype_start, subtype_end", [("int64", "uint64"), ("uint64", "int64")] + ) + def test_subtype_integer(self, subtype_start, subtype_end): + index = IntervalIndex.from_breaks(np.arange(100, dtype=subtype_start)) + dtype = IntervalDtype(subtype_end, index.closed) + result = index.astype(dtype) + expected = IntervalIndex.from_arrays( + index.left.astype(subtype_end), + index.right.astype(subtype_end), + closed=index.closed, + ) + tm.assert_index_equal(result, expected) + + @pytest.mark.xfail(reason="GH#15832") + def test_subtype_integer_errors(self): + # int64 -> uint64 fails with negative values + index = interval_range(-10, 10) + dtype = IntervalDtype("uint64", "right") + + # Until we decide what the exception message _should_ be, we + # assert something that it should _not_ be. + # We should _not_ be getting a message suggesting that the -10 + # has been wrapped around to a large-positive integer + msg = "^(?!(left side of interval must be <= right side))" + with pytest.raises(ValueError, match=msg): + index.astype(dtype) + + +class TestFloatSubtype(AstypeTests): + """Tests specific to IntervalIndex with float subtype""" + + indexes = [ + interval_range(-10.0, 10.0, closed="neither"), + IntervalIndex.from_arrays( + [-1.5, np.nan, 0.0, 0.0, 1.5], [-0.5, np.nan, 1.0, 1.0, 3.0], closed="both" + ), + ] + + @pytest.fixture(params=indexes) + def index(self, request): + return request.param + + @pytest.mark.parametrize("subtype", ["int64", "uint64"]) + def test_subtype_integer(self, subtype): + index = interval_range(0.0, 10.0) + dtype = IntervalDtype(subtype, "right") + result = index.astype(dtype) + expected = IntervalIndex.from_arrays( + index.left.astype(subtype), index.right.astype(subtype), closed=index.closed + ) + tm.assert_index_equal(result, expected) + + # raises with NA + msg = r"Cannot convert non-finite values \(NA or inf\) to integer" + with pytest.raises(ValueError, match=msg): + index.insert(0, np.nan).astype(dtype) + + @pytest.mark.parametrize("subtype", ["int64", "uint64"]) + def test_subtype_integer_with_non_integer_borders(self, subtype): + index = interval_range(0.0, 3.0, freq=0.25) + dtype = IntervalDtype(subtype, "right") + result = index.astype(dtype) + expected = IntervalIndex.from_arrays( + index.left.astype(subtype), index.right.astype(subtype), closed=index.closed + ) + tm.assert_index_equal(result, expected) + + def test_subtype_integer_errors(self): + # float64 -> uint64 fails with negative values + index = interval_range(-10.0, 10.0) + dtype = IntervalDtype("uint64", "right") + msg = re.escape( + "Cannot convert interval[float64, right] to interval[uint64, right]; " + "subtypes are incompatible" + ) + with pytest.raises(TypeError, match=msg): + index.astype(dtype) + + @pytest.mark.parametrize("subtype", ["datetime64[ns]", "timedelta64[ns]"]) + def test_subtype_datetimelike(self, index, subtype): + dtype = IntervalDtype(subtype, "right") + msg = "Cannot convert .* to .*; subtypes are incompatible" + with pytest.raises(TypeError, match=msg): + index.astype(dtype) + + +class TestDatetimelikeSubtype(AstypeTests): + """Tests specific to IntervalIndex with datetime-like subtype""" + + indexes = [ + interval_range(Timestamp("2018-01-01"), periods=10, closed="neither"), + interval_range(Timestamp("2018-01-01"), periods=10).insert(2, NaT), + interval_range(Timestamp("2018-01-01", tz="US/Eastern"), periods=10), + interval_range(Timedelta("0 days"), periods=10, closed="both"), + interval_range(Timedelta("0 days"), periods=10).insert(2, NaT), + ] + + @pytest.fixture(params=indexes) + def index(self, request): + return request.param + + @pytest.mark.parametrize("subtype", ["int64", "uint64"]) + def test_subtype_integer(self, index, subtype): + dtype = IntervalDtype(subtype, "right") + + if subtype != "int64": + msg = ( + r"Cannot convert interval\[(timedelta64|datetime64)\[ns.*\], .*\] " + r"to interval\[uint64, .*\]" + ) + with pytest.raises(TypeError, match=msg): + index.astype(dtype) + return + + result = index.astype(dtype) + new_left = index.left.astype(subtype) + new_right = index.right.astype(subtype) + + expected = IntervalIndex.from_arrays(new_left, new_right, closed=index.closed) + tm.assert_index_equal(result, expected) + + def test_subtype_float(self, index): + dtype = IntervalDtype("float64", "right") + msg = "Cannot convert .* to .*; subtypes are incompatible" + with pytest.raises(TypeError, match=msg): + index.astype(dtype) + + def test_subtype_datetimelike(self): + # datetime -> timedelta raises + dtype = IntervalDtype("timedelta64[ns]", "right") + msg = "Cannot convert .* to .*; subtypes are incompatible" + + index = interval_range(Timestamp("2018-01-01"), periods=10) + with pytest.raises(TypeError, match=msg): + index.astype(dtype) + + index = interval_range(Timestamp("2018-01-01", tz="CET"), periods=10) + with pytest.raises(TypeError, match=msg): + index.astype(dtype) + + # timedelta -> datetime raises + dtype = IntervalDtype("datetime64[ns]", "right") + index = interval_range(Timedelta("0 days"), periods=10) + with pytest.raises(TypeError, match=msg): + index.astype(dtype) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/test_constructors.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/test_constructors.py new file mode 100644 index 0000000000000000000000000000000000000000..e47a014f18045ae20fe27805a31b819b4ad229b9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/test_constructors.py @@ -0,0 +1,535 @@ +from functools import partial + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas.core.dtypes.common import is_unsigned_integer_dtype +from pandas.core.dtypes.dtypes import IntervalDtype + +from pandas import ( + Categorical, + CategoricalDtype, + CategoricalIndex, + Index, + Interval, + IntervalIndex, + date_range, + notna, + period_range, + timedelta_range, +) +import pandas._testing as tm +from pandas.core.arrays import IntervalArray +import pandas.core.common as com + + +@pytest.fixture(params=[None, "foo"]) +def name(request): + return request.param + + +class ConstructorTests: + """ + Common tests for all variations of IntervalIndex construction. Input data + to be supplied in breaks format, then converted by the subclass method + get_kwargs_from_breaks to the expected format. + """ + + @pytest.fixture( + params=[ + ([3, 14, 15, 92, 653], np.int64), + (np.arange(10, dtype="int64"), np.int64), + (Index(np.arange(-10, 11, dtype=np.int64)), np.int64), + (Index(np.arange(10, 31, dtype=np.uint64)), np.uint64), + (Index(np.arange(20, 30, 0.5), dtype=np.float64), np.float64), + (date_range("20180101", periods=10), " Interval(0.5, 1.5) + tm.assert_numpy_array_equal(actual, expected) + + actual = self.index == self.index + expected = np.array([True, True]) + tm.assert_numpy_array_equal(actual, expected) + actual = self.index <= self.index + tm.assert_numpy_array_equal(actual, expected) + actual = self.index >= self.index + tm.assert_numpy_array_equal(actual, expected) + + actual = self.index < self.index + expected = np.array([False, False]) + tm.assert_numpy_array_equal(actual, expected) + actual = self.index > self.index + tm.assert_numpy_array_equal(actual, expected) + + actual = self.index == IntervalIndex.from_breaks([0, 1, 2], "left") + tm.assert_numpy_array_equal(actual, expected) + + actual = self.index == self.index.values + tm.assert_numpy_array_equal(actual, np.array([True, True])) + actual = self.index.values == self.index + tm.assert_numpy_array_equal(actual, np.array([True, True])) + actual = self.index <= self.index.values + tm.assert_numpy_array_equal(actual, np.array([True, True])) + actual = self.index != self.index.values + tm.assert_numpy_array_equal(actual, np.array([False, False])) + actual = self.index > self.index.values + tm.assert_numpy_array_equal(actual, np.array([False, False])) + actual = self.index.values > self.index + tm.assert_numpy_array_equal(actual, np.array([False, False])) + + # invalid comparisons + actual = self.index == 0 + tm.assert_numpy_array_equal(actual, np.array([False, False])) + actual = self.index == self.index.left + tm.assert_numpy_array_equal(actual, np.array([False, False])) + + msg = "|".join( + [ + "not supported between instances of 'int' and '.*.Interval'", + r"Invalid comparison between dtype=interval\[int64, right\] and ", + ] + ) + with pytest.raises(TypeError, match=msg): + self.index > 0 + with pytest.raises(TypeError, match=msg): + self.index <= 0 + with pytest.raises(TypeError, match=msg): + self.index > np.arange(2) + + msg = "Lengths must match to compare" + with pytest.raises(ValueError, match=msg): + self.index > np.arange(3) + + def test_missing_values(self, closed): + idx = Index( + [np.nan, Interval(0, 1, closed=closed), Interval(1, 2, closed=closed)] + ) + idx2 = IntervalIndex.from_arrays([np.nan, 0, 1], [np.nan, 1, 2], closed=closed) + assert idx.equals(idx2) + + msg = ( + "missing values must be missing in the same location both left " + "and right sides" + ) + with pytest.raises(ValueError, match=msg): + IntervalIndex.from_arrays( + [np.nan, 0, 1], np.array([0, 1, 2]), closed=closed + ) + + tm.assert_numpy_array_equal(isna(idx), np.array([True, False, False])) + + def test_sort_values(self, closed): + index = self.create_index(closed=closed) + + result = index.sort_values() + tm.assert_index_equal(result, index) + + result = index.sort_values(ascending=False) + tm.assert_index_equal(result, index[::-1]) + + # with nan + index = IntervalIndex([Interval(1, 2), np.nan, Interval(0, 1)]) + + result = index.sort_values() + expected = IntervalIndex([Interval(0, 1), Interval(1, 2), np.nan]) + tm.assert_index_equal(result, expected) + + result = index.sort_values(ascending=False, na_position="first") + expected = IntervalIndex([np.nan, Interval(1, 2), Interval(0, 1)]) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "US/Eastern"]) + def test_datetime(self, tz): + start = Timestamp("2000-01-01", tz=tz) + dates = date_range(start=start, periods=10) + index = IntervalIndex.from_breaks(dates) + + # test mid + start = Timestamp("2000-01-01T12:00", tz=tz) + expected = date_range(start=start, periods=9) + tm.assert_index_equal(index.mid, expected) + + # __contains__ doesn't check individual points + assert Timestamp("2000-01-01", tz=tz) not in index + assert Timestamp("2000-01-01T12", tz=tz) not in index + assert Timestamp("2000-01-02", tz=tz) not in index + iv_true = Interval( + Timestamp("2000-01-02", tz=tz), Timestamp("2000-01-03", tz=tz) + ) + iv_false = Interval( + Timestamp("1999-12-31", tz=tz), Timestamp("2000-01-01", tz=tz) + ) + assert iv_true in index + assert iv_false not in index + + # .contains does check individual points + assert not index.contains(Timestamp("2000-01-01", tz=tz)).any() + assert index.contains(Timestamp("2000-01-01T12", tz=tz)).any() + assert index.contains(Timestamp("2000-01-02", tz=tz)).any() + + # test get_indexer + start = Timestamp("1999-12-31T12:00", tz=tz) + target = date_range(start=start, periods=7, freq="12h") + actual = index.get_indexer(target) + expected = np.array([-1, -1, 0, 0, 1, 1, 2], dtype="intp") + tm.assert_numpy_array_equal(actual, expected) + + start = Timestamp("2000-01-08T18:00", tz=tz) + target = date_range(start=start, periods=7, freq="6h") + actual = index.get_indexer(target) + expected = np.array([7, 7, 8, 8, 8, 8, -1], dtype="intp") + tm.assert_numpy_array_equal(actual, expected) + + def test_append(self, closed): + index1 = IntervalIndex.from_arrays([0, 1], [1, 2], closed=closed) + index2 = IntervalIndex.from_arrays([1, 2], [2, 3], closed=closed) + + result = index1.append(index2) + expected = IntervalIndex.from_arrays([0, 1, 1, 2], [1, 2, 2, 3], closed=closed) + tm.assert_index_equal(result, expected) + + result = index1.append([index1, index2]) + expected = IntervalIndex.from_arrays( + [0, 1, 0, 1, 1, 2], [1, 2, 1, 2, 2, 3], closed=closed + ) + tm.assert_index_equal(result, expected) + + for other_closed in {"left", "right", "both", "neither"} - {closed}: + index_other_closed = IntervalIndex.from_arrays( + [0, 1], [1, 2], closed=other_closed + ) + result = index1.append(index_other_closed) + expected = index1.astype(object).append(index_other_closed.astype(object)) + tm.assert_index_equal(result, expected) + + def test_is_non_overlapping_monotonic(self, closed): + # Should be True in all cases + tpls = [(0, 1), (2, 3), (4, 5), (6, 7)] + idx = IntervalIndex.from_tuples(tpls, closed=closed) + assert idx.is_non_overlapping_monotonic is True + + idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed) + assert idx.is_non_overlapping_monotonic is True + + # Should be False in all cases (overlapping) + tpls = [(0, 2), (1, 3), (4, 5), (6, 7)] + idx = IntervalIndex.from_tuples(tpls, closed=closed) + assert idx.is_non_overlapping_monotonic is False + + idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed) + assert idx.is_non_overlapping_monotonic is False + + # Should be False in all cases (non-monotonic) + tpls = [(0, 1), (2, 3), (6, 7), (4, 5)] + idx = IntervalIndex.from_tuples(tpls, closed=closed) + assert idx.is_non_overlapping_monotonic is False + + idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed) + assert idx.is_non_overlapping_monotonic is False + + # Should be False for closed='both', otherwise True (GH16560) + if closed == "both": + idx = IntervalIndex.from_breaks(range(4), closed=closed) + assert idx.is_non_overlapping_monotonic is False + else: + idx = IntervalIndex.from_breaks(range(4), closed=closed) + assert idx.is_non_overlapping_monotonic is True + + @pytest.mark.parametrize( + "start, shift, na_value", + [ + (0, 1, np.nan), + (Timestamp("2018-01-01"), Timedelta("1 day"), pd.NaT), + (Timedelta("0 days"), Timedelta("1 day"), pd.NaT), + ], + ) + def test_is_overlapping(self, start, shift, na_value, closed): + # GH 23309 + # see test_interval_tree.py for extensive tests; interface tests here + + # non-overlapping + tuples = [(start + n * shift, start + (n + 1) * shift) for n in (0, 2, 4)] + index = IntervalIndex.from_tuples(tuples, closed=closed) + assert index.is_overlapping is False + + # non-overlapping with NA + tuples = [(na_value, na_value)] + tuples + [(na_value, na_value)] + index = IntervalIndex.from_tuples(tuples, closed=closed) + assert index.is_overlapping is False + + # overlapping + tuples = [(start + n * shift, start + (n + 2) * shift) for n in range(3)] + index = IntervalIndex.from_tuples(tuples, closed=closed) + assert index.is_overlapping is True + + # overlapping with NA + tuples = [(na_value, na_value)] + tuples + [(na_value, na_value)] + index = IntervalIndex.from_tuples(tuples, closed=closed) + assert index.is_overlapping is True + + # common endpoints + tuples = [(start + n * shift, start + (n + 1) * shift) for n in range(3)] + index = IntervalIndex.from_tuples(tuples, closed=closed) + result = index.is_overlapping + expected = closed == "both" + assert result is expected + + # common endpoints with NA + tuples = [(na_value, na_value)] + tuples + [(na_value, na_value)] + index = IntervalIndex.from_tuples(tuples, closed=closed) + result = index.is_overlapping + assert result is expected + + # intervals with duplicate left values + a = [10, 15, 20, 25, 30, 35, 40, 45, 45, 50, 55, 60, 65, 70, 75, 80, 85] + b = [15, 20, 25, 30, 35, 40, 45, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90] + index = IntervalIndex.from_arrays(a, b, closed="right") + result = index.is_overlapping + assert result is False + + @pytest.mark.parametrize( + "tuples", + [ + list(zip(range(10), range(1, 11))), + list( + zip( + date_range("20170101", periods=10), + date_range("20170101", periods=10), + ) + ), + list( + zip( + timedelta_range("0 days", periods=10), + timedelta_range("1 day", periods=10), + ) + ), + ], + ) + def test_to_tuples(self, tuples): + # GH 18756 + idx = IntervalIndex.from_tuples(tuples) + result = idx.to_tuples() + expected = Index(com.asarray_tuplesafe(tuples)) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "tuples", + [ + list(zip(range(10), range(1, 11))) + [np.nan], + list( + zip( + date_range("20170101", periods=10), + date_range("20170101", periods=10), + ) + ) + + [np.nan], + list( + zip( + timedelta_range("0 days", periods=10), + timedelta_range("1 day", periods=10), + ) + ) + + [np.nan], + ], + ) + @pytest.mark.parametrize("na_tuple", [True, False]) + def test_to_tuples_na(self, tuples, na_tuple): + # GH 18756 + idx = IntervalIndex.from_tuples(tuples) + result = idx.to_tuples(na_tuple=na_tuple) + + # check the non-NA portion + expected_notna = Index(com.asarray_tuplesafe(tuples[:-1])) + result_notna = result[:-1] + tm.assert_index_equal(result_notna, expected_notna) + + # check the NA portion + result_na = result[-1] + if na_tuple: + assert isinstance(result_na, tuple) + assert len(result_na) == 2 + assert all(isna(x) for x in result_na) + else: + assert isna(result_na) + + def test_nbytes(self): + # GH 19209 + left = np.arange(0, 4, dtype="i8") + right = np.arange(1, 5, dtype="i8") + + result = IntervalIndex.from_arrays(left, right).nbytes + expected = 64 # 4 * 8 * 2 + assert result == expected + + @pytest.mark.parametrize("new_closed", ["left", "right", "both", "neither"]) + def test_set_closed(self, name, closed, new_closed): + # GH 21670 + index = interval_range(0, 5, closed=closed, name=name) + result = index.set_closed(new_closed) + expected = interval_range(0, 5, closed=new_closed, name=name) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("bad_closed", ["foo", 10, "LEFT", True, False]) + def test_set_closed_errors(self, bad_closed): + # GH 21670 + index = interval_range(0, 5) + msg = f"invalid option for 'closed': {bad_closed}" + with pytest.raises(ValueError, match=msg): + index.set_closed(bad_closed) + + def test_is_all_dates(self): + # GH 23576 + year_2017 = Interval( + Timestamp("2017-01-01 00:00:00"), Timestamp("2018-01-01 00:00:00") + ) + year_2017_index = IntervalIndex([year_2017]) + assert not year_2017_index._is_all_dates + + +def test_dir(): + # GH#27571 dir(interval_index) should not raise + index = IntervalIndex.from_arrays([0, 1], [1, 2]) + result = dir(index) + assert "str" not in result + + +def test_searchsorted_different_argument_classes(listlike_box): + # https://github.com/pandas-dev/pandas/issues/32762 + values = IntervalIndex([Interval(0, 1), Interval(1, 2)]) + result = values.searchsorted(listlike_box(values)) + expected = np.array([0, 1], dtype=result.dtype) + tm.assert_numpy_array_equal(result, expected) + + result = values._data.searchsorted(listlike_box(values)) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize( + "arg", [[1, 2], ["a", "b"], [Timestamp("2020-01-01", tz="Europe/London")] * 2] +) +def test_searchsorted_invalid_argument(arg): + values = IntervalIndex([Interval(0, 1), Interval(1, 2)]) + msg = "'<' not supported between instances of 'pandas._libs.interval.Interval' and " + with pytest.raises(TypeError, match=msg): + values.searchsorted(arg) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/test_interval_range.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/test_interval_range.py new file mode 100644 index 0000000000000000000000000000000000000000..e8de59f84bcc6d6cece2768f942b4599d3ce1a2d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/test_interval_range.py @@ -0,0 +1,369 @@ +from datetime import timedelta + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_integer + +from pandas import ( + DateOffset, + Interval, + IntervalIndex, + Timedelta, + Timestamp, + date_range, + interval_range, + timedelta_range, +) +import pandas._testing as tm + +from pandas.tseries.offsets import Day + + +@pytest.fixture(params=[None, "foo"]) +def name(request): + return request.param + + +class TestIntervalRange: + @pytest.mark.parametrize("freq, periods", [(1, 100), (2.5, 40), (5, 20), (25, 4)]) + def test_constructor_numeric(self, closed, name, freq, periods): + start, end = 0, 100 + breaks = np.arange(101, step=freq) + expected = IntervalIndex.from_breaks(breaks, name=name, closed=closed) + + # defined from start/end/freq + result = interval_range( + start=start, end=end, freq=freq, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + # defined from start/periods/freq + result = interval_range( + start=start, periods=periods, freq=freq, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + # defined from end/periods/freq + result = interval_range( + end=end, periods=periods, freq=freq, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + # GH 20976: linspace behavior defined from start/end/periods + result = interval_range( + start=start, end=end, periods=periods, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "US/Eastern"]) + @pytest.mark.parametrize( + "freq, periods", [("D", 364), ("2D", 182), ("22D18h", 16), ("ME", 11)] + ) + def test_constructor_timestamp(self, closed, name, freq, periods, tz): + start, end = Timestamp("20180101", tz=tz), Timestamp("20181231", tz=tz) + breaks = date_range(start=start, end=end, freq=freq) + expected = IntervalIndex.from_breaks(breaks, name=name, closed=closed) + + # defined from start/end/freq + result = interval_range( + start=start, end=end, freq=freq, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + # defined from start/periods/freq + result = interval_range( + start=start, periods=periods, freq=freq, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + # defined from end/periods/freq + result = interval_range( + end=end, periods=periods, freq=freq, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + # GH 20976: linspace behavior defined from start/end/periods + if not breaks.freq.n == 1 and tz is None: + result = interval_range( + start=start, end=end, periods=periods, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "freq, periods", [("D", 100), ("2D12h", 40), ("5D", 20), ("25D", 4)] + ) + def test_constructor_timedelta(self, closed, name, freq, periods): + start, end = Timedelta("0 days"), Timedelta("100 days") + breaks = timedelta_range(start=start, end=end, freq=freq) + expected = IntervalIndex.from_breaks(breaks, name=name, closed=closed) + + # defined from start/end/freq + result = interval_range( + start=start, end=end, freq=freq, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + # defined from start/periods/freq + result = interval_range( + start=start, periods=periods, freq=freq, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + # defined from end/periods/freq + result = interval_range( + end=end, periods=periods, freq=freq, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + # GH 20976: linspace behavior defined from start/end/periods + result = interval_range( + start=start, end=end, periods=periods, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "start, end, freq, expected_endpoint", + [ + (0, 10, 3, 9), + (0, 10, 1.5, 9), + (0.5, 10, 3, 9.5), + (Timedelta("0D"), Timedelta("10D"), "2D4h", Timedelta("8D16h")), + ( + Timestamp("2018-01-01"), + Timestamp("2018-02-09"), + "MS", + Timestamp("2018-02-01"), + ), + ( + Timestamp("2018-01-01", tz="US/Eastern"), + Timestamp("2018-01-20", tz="US/Eastern"), + "5D12h", + Timestamp("2018-01-17 12:00:00", tz="US/Eastern"), + ), + ], + ) + def test_early_truncation(self, start, end, freq, expected_endpoint): + # index truncates early if freq causes end to be skipped + result = interval_range(start=start, end=end, freq=freq) + result_endpoint = result.right[-1] + assert result_endpoint == expected_endpoint + + @pytest.mark.parametrize( + "start, end, freq", + [(0.5, None, None), (None, 4.5, None), (0.5, None, 1.5), (None, 6.5, 1.5)], + ) + def test_no_invalid_float_truncation(self, start, end, freq): + # GH 21161 + if freq is None: + breaks = [0.5, 1.5, 2.5, 3.5, 4.5] + else: + breaks = [0.5, 2.0, 3.5, 5.0, 6.5] + expected = IntervalIndex.from_breaks(breaks) + + result = interval_range(start=start, end=end, periods=4, freq=freq) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "start, mid, end", + [ + ( + Timestamp("2018-03-10", tz="US/Eastern"), + Timestamp("2018-03-10 23:30:00", tz="US/Eastern"), + Timestamp("2018-03-12", tz="US/Eastern"), + ), + ( + Timestamp("2018-11-03", tz="US/Eastern"), + Timestamp("2018-11-04 00:30:00", tz="US/Eastern"), + Timestamp("2018-11-05", tz="US/Eastern"), + ), + ], + ) + def test_linspace_dst_transition(self, start, mid, end): + # GH 20976: linspace behavior defined from start/end/periods + # accounts for the hour gained/lost during DST transition + start = start.as_unit("ns") + mid = mid.as_unit("ns") + end = end.as_unit("ns") + result = interval_range(start=start, end=end, periods=2) + expected = IntervalIndex.from_breaks([start, mid, end]) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("freq", [2, 2.0]) + @pytest.mark.parametrize("end", [10, 10.0]) + @pytest.mark.parametrize("start", [0, 0.0]) + def test_float_subtype(self, start, end, freq): + # Has float subtype if any of start/end/freq are float, even if all + # resulting endpoints can safely be upcast to integers + + # defined from start/end/freq + index = interval_range(start=start, end=end, freq=freq) + result = index.dtype.subtype + expected = "int64" if is_integer(start + end + freq) else "float64" + assert result == expected + + # defined from start/periods/freq + index = interval_range(start=start, periods=5, freq=freq) + result = index.dtype.subtype + expected = "int64" if is_integer(start + freq) else "float64" + assert result == expected + + # defined from end/periods/freq + index = interval_range(end=end, periods=5, freq=freq) + result = index.dtype.subtype + expected = "int64" if is_integer(end + freq) else "float64" + assert result == expected + + # GH 20976: linspace behavior defined from start/end/periods + index = interval_range(start=start, end=end, periods=5) + result = index.dtype.subtype + expected = "int64" if is_integer(start + end) else "float64" + assert result == expected + + def test_interval_range_fractional_period(self): + # float value for periods + expected = interval_range(start=0, periods=10) + msg = "Non-integer 'periods' in pd.date_range, .* pd.interval_range" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = interval_range(start=0, periods=10.5) + tm.assert_index_equal(result, expected) + + def test_constructor_coverage(self): + # equivalent timestamp-like start/end + start, end = Timestamp("2017-01-01"), Timestamp("2017-01-15") + expected = interval_range(start=start, end=end) + + result = interval_range(start=start.to_pydatetime(), end=end.to_pydatetime()) + tm.assert_index_equal(result, expected) + + result = interval_range(start=start.asm8, end=end.asm8) + tm.assert_index_equal(result, expected) + + # equivalent freq with timestamp + equiv_freq = [ + "D", + Day(), + Timedelta(days=1), + timedelta(days=1), + DateOffset(days=1), + ] + for freq in equiv_freq: + result = interval_range(start=start, end=end, freq=freq) + tm.assert_index_equal(result, expected) + + # equivalent timedelta-like start/end + start, end = Timedelta(days=1), Timedelta(days=10) + expected = interval_range(start=start, end=end) + + result = interval_range(start=start.to_pytimedelta(), end=end.to_pytimedelta()) + tm.assert_index_equal(result, expected) + + result = interval_range(start=start.asm8, end=end.asm8) + tm.assert_index_equal(result, expected) + + # equivalent freq with timedelta + equiv_freq = ["D", Day(), Timedelta(days=1), timedelta(days=1)] + for freq in equiv_freq: + result = interval_range(start=start, end=end, freq=freq) + tm.assert_index_equal(result, expected) + + def test_errors(self): + # not enough params + msg = ( + "Of the four parameters: start, end, periods, and freq, " + "exactly three must be specified" + ) + + with pytest.raises(ValueError, match=msg): + interval_range(start=0) + + with pytest.raises(ValueError, match=msg): + interval_range(end=5) + + with pytest.raises(ValueError, match=msg): + interval_range(periods=2) + + with pytest.raises(ValueError, match=msg): + interval_range() + + # too many params + with pytest.raises(ValueError, match=msg): + interval_range(start=0, end=5, periods=6, freq=1.5) + + # mixed units + msg = "start, end, freq need to be type compatible" + with pytest.raises(TypeError, match=msg): + interval_range(start=0, end=Timestamp("20130101"), freq=2) + + with pytest.raises(TypeError, match=msg): + interval_range(start=0, end=Timedelta("1 day"), freq=2) + + with pytest.raises(TypeError, match=msg): + interval_range(start=0, end=10, freq="D") + + with pytest.raises(TypeError, match=msg): + interval_range(start=Timestamp("20130101"), end=10, freq="D") + + with pytest.raises(TypeError, match=msg): + interval_range( + start=Timestamp("20130101"), end=Timedelta("1 day"), freq="D" + ) + + with pytest.raises(TypeError, match=msg): + interval_range( + start=Timestamp("20130101"), end=Timestamp("20130110"), freq=2 + ) + + with pytest.raises(TypeError, match=msg): + interval_range(start=Timedelta("1 day"), end=10, freq="D") + + with pytest.raises(TypeError, match=msg): + interval_range( + start=Timedelta("1 day"), end=Timestamp("20130110"), freq="D" + ) + + with pytest.raises(TypeError, match=msg): + interval_range(start=Timedelta("1 day"), end=Timedelta("10 days"), freq=2) + + # invalid periods + msg = "periods must be a number, got foo" + with pytest.raises(TypeError, match=msg): + interval_range(start=0, periods="foo") + + # invalid start + msg = "start must be numeric or datetime-like, got foo" + with pytest.raises(ValueError, match=msg): + interval_range(start="foo", periods=10) + + # invalid end + msg = r"end must be numeric or datetime-like, got \(0, 1\]" + with pytest.raises(ValueError, match=msg): + interval_range(end=Interval(0, 1), periods=10) + + # invalid freq for datetime-like + msg = "freq must be numeric or convertible to DateOffset, got foo" + with pytest.raises(ValueError, match=msg): + interval_range(start=0, end=10, freq="foo") + + with pytest.raises(ValueError, match=msg): + interval_range(start=Timestamp("20130101"), periods=10, freq="foo") + + with pytest.raises(ValueError, match=msg): + interval_range(end=Timedelta("1 day"), periods=10, freq="foo") + + # mixed tz + start = Timestamp("2017-01-01", tz="US/Eastern") + end = Timestamp("2017-01-07", tz="US/Pacific") + msg = "Start and end cannot both be tz-aware with different timezones" + with pytest.raises(TypeError, match=msg): + interval_range(start=start, end=end) + + def test_float_freq(self): + # GH 54477 + result = interval_range(0, 1, freq=0.1) + expected = IntervalIndex.from_breaks([0 + 0.1 * n for n in range(11)]) + tm.assert_index_equal(result, expected) + + result = interval_range(0, 1, freq=0.6) + expected = IntervalIndex.from_breaks([0, 0.6]) + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/test_interval_tree.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/test_interval_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..78388e84fc6dc1af7dadd78b88a1155ed8cfd812 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/test_interval_tree.py @@ -0,0 +1,208 @@ +from itertools import permutations + +import numpy as np +import pytest + +from pandas._libs.interval import IntervalTree +from pandas.compat import IS64 + +import pandas._testing as tm + + +def skipif_32bit(param): + """ + Skip parameters in a parametrize on 32bit systems. Specifically used + here to skip leaf_size parameters related to GH 23440. + """ + marks = pytest.mark.skipif(not IS64, reason="GH 23440: int type mismatch on 32bit") + return pytest.param(param, marks=marks) + + +@pytest.fixture(params=["int64", "float64", "uint64"]) +def dtype(request): + return request.param + + +@pytest.fixture(params=[skipif_32bit(1), skipif_32bit(2), 10]) +def leaf_size(request): + """ + Fixture to specify IntervalTree leaf_size parameter; to be used with the + tree fixture. + """ + return request.param + + +@pytest.fixture( + params=[ + np.arange(5, dtype="int64"), + np.arange(5, dtype="uint64"), + np.arange(5, dtype="float64"), + np.array([0, 1, 2, 3, 4, np.nan], dtype="float64"), + ] +) +def tree(request, leaf_size): + left = request.param + return IntervalTree(left, left + 2, leaf_size=leaf_size) + + +class TestIntervalTree: + def test_get_indexer(self, tree): + result = tree.get_indexer(np.array([1.0, 5.5, 6.5])) + expected = np.array([0, 4, -1], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + with pytest.raises( + KeyError, match="'indexer does not intersect a unique set of intervals'" + ): + tree.get_indexer(np.array([3.0])) + + @pytest.mark.parametrize( + "dtype, target_value, target_dtype", + [("int64", 2**63 + 1, "uint64"), ("uint64", -1, "int64")], + ) + def test_get_indexer_overflow(self, dtype, target_value, target_dtype): + left, right = np.array([0, 1], dtype=dtype), np.array([1, 2], dtype=dtype) + tree = IntervalTree(left, right) + + result = tree.get_indexer(np.array([target_value], dtype=target_dtype)) + expected = np.array([-1], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + def test_get_indexer_non_unique(self, tree): + indexer, missing = tree.get_indexer_non_unique(np.array([1.0, 2.0, 6.5])) + + result = indexer[:1] + expected = np.array([0], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + result = np.sort(indexer[1:3]) + expected = np.array([0, 1], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + result = np.sort(indexer[3:]) + expected = np.array([-1], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + result = missing + expected = np.array([2], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "dtype, target_value, target_dtype", + [("int64", 2**63 + 1, "uint64"), ("uint64", -1, "int64")], + ) + def test_get_indexer_non_unique_overflow(self, dtype, target_value, target_dtype): + left, right = np.array([0, 2], dtype=dtype), np.array([1, 3], dtype=dtype) + tree = IntervalTree(left, right) + target = np.array([target_value], dtype=target_dtype) + + result_indexer, result_missing = tree.get_indexer_non_unique(target) + expected_indexer = np.array([-1], dtype="intp") + tm.assert_numpy_array_equal(result_indexer, expected_indexer) + + expected_missing = np.array([0], dtype="intp") + tm.assert_numpy_array_equal(result_missing, expected_missing) + + def test_duplicates(self, dtype): + left = np.array([0, 0, 0], dtype=dtype) + tree = IntervalTree(left, left + 1) + + with pytest.raises( + KeyError, match="'indexer does not intersect a unique set of intervals'" + ): + tree.get_indexer(np.array([0.5])) + + indexer, missing = tree.get_indexer_non_unique(np.array([0.5])) + result = np.sort(indexer) + expected = np.array([0, 1, 2], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + result = missing + expected = np.array([], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "leaf_size", [skipif_32bit(1), skipif_32bit(10), skipif_32bit(100), 10000] + ) + def test_get_indexer_closed(self, closed, leaf_size): + x = np.arange(1000, dtype="float64") + found = x.astype("intp") + not_found = (-1 * np.ones(1000)).astype("intp") + + tree = IntervalTree(x, x + 0.5, closed=closed, leaf_size=leaf_size) + tm.assert_numpy_array_equal(found, tree.get_indexer(x + 0.25)) + + expected = found if tree.closed_left else not_found + tm.assert_numpy_array_equal(expected, tree.get_indexer(x + 0.0)) + + expected = found if tree.closed_right else not_found + tm.assert_numpy_array_equal(expected, tree.get_indexer(x + 0.5)) + + @pytest.mark.parametrize( + "left, right, expected", + [ + (np.array([0, 1, 4], dtype="int64"), np.array([2, 3, 5]), True), + (np.array([0, 1, 2], dtype="int64"), np.array([5, 4, 3]), True), + (np.array([0, 1, np.nan]), np.array([5, 4, np.nan]), True), + (np.array([0, 2, 4], dtype="int64"), np.array([1, 3, 5]), False), + (np.array([0, 2, np.nan]), np.array([1, 3, np.nan]), False), + ], + ) + @pytest.mark.parametrize("order", (list(x) for x in permutations(range(3)))) + def test_is_overlapping(self, closed, order, left, right, expected): + # GH 23309 + tree = IntervalTree(left[order], right[order], closed=closed) + result = tree.is_overlapping + assert result is expected + + @pytest.mark.parametrize("order", (list(x) for x in permutations(range(3)))) + def test_is_overlapping_endpoints(self, closed, order): + """shared endpoints are marked as overlapping""" + # GH 23309 + left, right = np.arange(3, dtype="int64"), np.arange(1, 4) + tree = IntervalTree(left[order], right[order], closed=closed) + result = tree.is_overlapping + expected = closed == "both" + assert result is expected + + @pytest.mark.parametrize( + "left, right", + [ + (np.array([], dtype="int64"), np.array([], dtype="int64")), + (np.array([0], dtype="int64"), np.array([1], dtype="int64")), + (np.array([np.nan]), np.array([np.nan])), + (np.array([np.nan] * 3), np.array([np.nan] * 3)), + ], + ) + def test_is_overlapping_trivial(self, closed, left, right): + # GH 23309 + tree = IntervalTree(left, right, closed=closed) + assert tree.is_overlapping is False + + @pytest.mark.skipif(not IS64, reason="GH 23440") + def test_construction_overflow(self): + # GH 25485 + left, right = np.arange(101, dtype="int64"), [np.iinfo(np.int64).max] * 101 + tree = IntervalTree(left, right) + + # pivot should be average of left/right medians + result = tree.root.pivot + expected = (50 + np.iinfo(np.int64).max) / 2 + assert result == expected + + @pytest.mark.parametrize( + "left, right, expected", + [ + ([-np.inf, 1.0], [1.0, 2.0], 0.0), + ([-np.inf, -2.0], [-2.0, -1.0], -2.0), + ([-2.0, -1.0], [-1.0, np.inf], 0.0), + ([1.0, 2.0], [2.0, np.inf], 2.0), + ], + ) + def test_inf_bound_infinite_recursion(self, left, right, expected): + # GH 46658 + + tree = IntervalTree(left * 101, right * 101) + + result = tree.root.pivot + assert result == expected diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/test_join.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/test_join.py new file mode 100644 index 0000000000000000000000000000000000000000..2f42c530a66868fa69b1d449e75f84d42592bb77 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/test_join.py @@ -0,0 +1,44 @@ +import pytest + +from pandas import ( + IntervalIndex, + MultiIndex, + RangeIndex, +) +import pandas._testing as tm + + +@pytest.fixture +def range_index(): + return RangeIndex(3, name="range_index") + + +@pytest.fixture +def interval_index(): + return IntervalIndex.from_tuples( + [(0.0, 1.0), (1.0, 2.0), (1.5, 2.5)], name="interval_index" + ) + + +def test_join_overlapping_in_mi_to_same_intervalindex(range_index, interval_index): + # GH-45661 + multi_index = MultiIndex.from_product([interval_index, range_index]) + result = multi_index.join(interval_index) + + tm.assert_index_equal(result, multi_index) + + +def test_join_overlapping_to_multiindex_with_same_interval(range_index, interval_index): + # GH-45661 + multi_index = MultiIndex.from_product([interval_index, range_index]) + result = interval_index.join(multi_index) + + tm.assert_index_equal(result, multi_index) + + +def test_join_overlapping_interval_to_another_intervalindex(interval_index): + # GH-45661 + flipped_interval_index = interval_index[::-1] + result = interval_index.join(flipped_interval_index) + + tm.assert_index_equal(result, interval_index) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/test_pickle.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/test_pickle.py new file mode 100644 index 0000000000000000000000000000000000000000..308a90e72eab5db55f300341212d2c04e82c6900 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/test_pickle.py @@ -0,0 +1,13 @@ +import pytest + +from pandas import IntervalIndex +import pandas._testing as tm + + +class TestPickle: + @pytest.mark.parametrize("closed", ["left", "right", "both"]) + def test_pickle_round_trip_closed(self, closed): + # https://github.com/pandas-dev/pandas/issues/35658 + idx = IntervalIndex.from_tuples([(1, 2), (2, 3)], closed=closed) + result = tm.round_trip_pickle(idx) + tm.assert_index_equal(result, idx) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/test_setops.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/test_setops.py new file mode 100644 index 0000000000000000000000000000000000000000..1b0816a9405cb9dd6ed81691e72012c948b898a2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/interval/test_setops.py @@ -0,0 +1,208 @@ +import numpy as np +import pytest + +from pandas import ( + Index, + IntervalIndex, + Timestamp, + interval_range, +) +import pandas._testing as tm + + +def monotonic_index(start, end, dtype="int64", closed="right"): + return IntervalIndex.from_breaks(np.arange(start, end, dtype=dtype), closed=closed) + + +def empty_index(dtype="int64", closed="right"): + return IntervalIndex(np.array([], dtype=dtype), closed=closed) + + +class TestIntervalIndex: + def test_union(self, closed, sort): + index = monotonic_index(0, 11, closed=closed) + other = monotonic_index(5, 13, closed=closed) + + expected = monotonic_index(0, 13, closed=closed) + result = index[::-1].union(other, sort=sort) + if sort in (None, True): + tm.assert_index_equal(result, expected) + else: + tm.assert_index_equal(result.sort_values(), expected) + + result = other[::-1].union(index, sort=sort) + if sort in (None, True): + tm.assert_index_equal(result, expected) + else: + tm.assert_index_equal(result.sort_values(), expected) + + tm.assert_index_equal(index.union(index, sort=sort), index) + tm.assert_index_equal(index.union(index[:1], sort=sort), index) + + def test_union_empty_result(self, closed, sort): + # GH 19101: empty result, same dtype + index = empty_index(dtype="int64", closed=closed) + result = index.union(index, sort=sort) + tm.assert_index_equal(result, index) + + # GH 19101: empty result, different numeric dtypes -> common dtype is f8 + other = empty_index(dtype="float64", closed=closed) + result = index.union(other, sort=sort) + expected = other + tm.assert_index_equal(result, expected) + + other = index.union(index, sort=sort) + tm.assert_index_equal(result, expected) + + other = empty_index(dtype="uint64", closed=closed) + result = index.union(other, sort=sort) + tm.assert_index_equal(result, expected) + + result = other.union(index, sort=sort) + tm.assert_index_equal(result, expected) + + def test_intersection(self, closed, sort): + index = monotonic_index(0, 11, closed=closed) + other = monotonic_index(5, 13, closed=closed) + + expected = monotonic_index(5, 11, closed=closed) + result = index[::-1].intersection(other, sort=sort) + if sort in (None, True): + tm.assert_index_equal(result, expected) + else: + tm.assert_index_equal(result.sort_values(), expected) + + result = other[::-1].intersection(index, sort=sort) + if sort in (None, True): + tm.assert_index_equal(result, expected) + else: + tm.assert_index_equal(result.sort_values(), expected) + + tm.assert_index_equal(index.intersection(index, sort=sort), index) + + # GH 26225: nested intervals + index = IntervalIndex.from_tuples([(1, 2), (1, 3), (1, 4), (0, 2)]) + other = IntervalIndex.from_tuples([(1, 2), (1, 3)]) + expected = IntervalIndex.from_tuples([(1, 2), (1, 3)]) + result = index.intersection(other) + tm.assert_index_equal(result, expected) + + # GH 26225 + index = IntervalIndex.from_tuples([(0, 3), (0, 2)]) + other = IntervalIndex.from_tuples([(0, 2), (1, 3)]) + expected = IntervalIndex.from_tuples([(0, 2)]) + result = index.intersection(other) + tm.assert_index_equal(result, expected) + + # GH 26225: duplicate nan element + index = IntervalIndex([np.nan, np.nan]) + other = IntervalIndex([np.nan]) + expected = IntervalIndex([np.nan]) + result = index.intersection(other) + tm.assert_index_equal(result, expected) + + def test_intersection_empty_result(self, closed, sort): + index = monotonic_index(0, 11, closed=closed) + + # GH 19101: empty result, same dtype + other = monotonic_index(300, 314, closed=closed) + expected = empty_index(dtype="int64", closed=closed) + result = index.intersection(other, sort=sort) + tm.assert_index_equal(result, expected) + + # GH 19101: empty result, different numeric dtypes -> common dtype is float64 + other = monotonic_index(300, 314, dtype="float64", closed=closed) + result = index.intersection(other, sort=sort) + expected = other[:0] + tm.assert_index_equal(result, expected) + + other = monotonic_index(300, 314, dtype="uint64", closed=closed) + result = index.intersection(other, sort=sort) + tm.assert_index_equal(result, expected) + + def test_intersection_duplicates(self): + # GH#38743 + index = IntervalIndex.from_tuples([(1, 2), (1, 2), (2, 3), (3, 4)]) + other = IntervalIndex.from_tuples([(1, 2), (2, 3)]) + expected = IntervalIndex.from_tuples([(1, 2), (2, 3)]) + result = index.intersection(other) + tm.assert_index_equal(result, expected) + + def test_difference(self, closed, sort): + index = IntervalIndex.from_arrays([1, 0, 3, 2], [1, 2, 3, 4], closed=closed) + result = index.difference(index[:1], sort=sort) + expected = index[1:] + if sort is None: + expected = expected.sort_values() + tm.assert_index_equal(result, expected) + + # GH 19101: empty result, same dtype + result = index.difference(index, sort=sort) + expected = empty_index(dtype="int64", closed=closed) + tm.assert_index_equal(result, expected) + + # GH 19101: empty result, different dtypes + other = IntervalIndex.from_arrays( + index.left.astype("float64"), index.right, closed=closed + ) + result = index.difference(other, sort=sort) + tm.assert_index_equal(result, expected) + + def test_symmetric_difference(self, closed, sort): + index = monotonic_index(0, 11, closed=closed) + result = index[1:].symmetric_difference(index[:-1], sort=sort) + expected = IntervalIndex([index[0], index[-1]]) + if sort in (None, True): + tm.assert_index_equal(result, expected) + else: + tm.assert_index_equal(result.sort_values(), expected) + + # GH 19101: empty result, same dtype + result = index.symmetric_difference(index, sort=sort) + expected = empty_index(dtype="int64", closed=closed) + if sort in (None, True): + tm.assert_index_equal(result, expected) + else: + tm.assert_index_equal(result.sort_values(), expected) + + # GH 19101: empty result, different dtypes + other = IntervalIndex.from_arrays( + index.left.astype("float64"), index.right, closed=closed + ) + result = index.symmetric_difference(other, sort=sort) + expected = empty_index(dtype="float64", closed=closed) + tm.assert_index_equal(result, expected) + + @pytest.mark.filterwarnings("ignore:'<' not supported between:RuntimeWarning") + @pytest.mark.parametrize( + "op_name", ["union", "intersection", "difference", "symmetric_difference"] + ) + def test_set_incompatible_types(self, closed, op_name, sort): + index = monotonic_index(0, 11, closed=closed) + set_op = getattr(index, op_name) + + # TODO: standardize return type of non-union setops type(self vs other) + # non-IntervalIndex + if op_name == "difference": + expected = index + else: + expected = getattr(index.astype("O"), op_name)(Index([1, 2, 3])) + result = set_op(Index([1, 2, 3]), sort=sort) + tm.assert_index_equal(result, expected) + + # mixed closed -> cast to object + for other_closed in {"right", "left", "both", "neither"} - {closed}: + other = monotonic_index(0, 11, closed=other_closed) + expected = getattr(index.astype(object), op_name)(other, sort=sort) + if op_name == "difference": + expected = index + result = set_op(other, sort=sort) + tm.assert_index_equal(result, expected) + + # GH 19016: incompatible dtypes -> cast to object + other = interval_range(Timestamp("20180101"), periods=9, closed=closed) + expected = getattr(index.astype(object), op_name)(other, sort=sort) + if op_name == "difference": + expected = index + result = set_op(other, sort=sort) + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b2c53f12a3e7d0681c921cf77d969789acc70ec6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/conftest.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/conftest.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c70f832dc282a6b474e172465892e13fd1807419 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/conftest.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_analytics.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_analytics.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9bbeb3a7d06bc8cb87af1b63e033b3c9d0eff0b2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_analytics.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_astype.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_astype.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..567d4e8edb3e68c34515a9254a4147f59439e4c6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_astype.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_compat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_compat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..107dc5abf2a0cc8c6c8ab7b31e207caeba78d5e5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_compat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_constructors.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_constructors.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32bb23242c1739fb66e18fbb73b4755138804c9b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_constructors.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_conversion.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_conversion.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ae2e46c071ac34e56f136f84e629e17f78e7f1b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_conversion.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_copy.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_copy.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a4e019f97726a36f18fd834eadc34ec87aaf7521 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_copy.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_drop.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_drop.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..385a8de1a65c81d0829b86abf8f38899966ed64c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_drop.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_duplicates.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_duplicates.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a786925d48c5daeb9f3c40858bc09f14038e5210 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_duplicates.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_equivalence.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_equivalence.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..563c2cd4e9edf6e4f9c6f9f8be2956f33068a378 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_equivalence.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_formats.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_formats.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32b16e276c1c3f1cbc303fb68e57f1e10ad0aca5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_formats.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_get_level_values.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_get_level_values.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..410ff4c477cca58c4b1d87eaa6ec5c47630dd37f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_get_level_values.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_get_set.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_get_set.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2afe41276b1f6ef8cb0edb8577a88801ebcfc940 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_get_set.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_indexing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_indexing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95cbfbaf6ecaca67bdbd66beeb17a2df6c2b65e1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_indexing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_integrity.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_integrity.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c5df7493c3bdfe11fbf714891e4095fb28ae70d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_integrity.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_isin.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_isin.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a30e87362ffc9efc6fd1fa9f79c8ebcca770d429 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_isin.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_join.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_join.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5091e6602296a89ba51a14f4695754a23fecc75a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_join.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_lexsort.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_lexsort.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c4d0bc50440600818dd6fe2435c3ed743881a0d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_lexsort.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_missing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_missing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cfc7e01dd78332bb618002c12e1c748feab28616 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_missing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_monotonic.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_monotonic.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2c5e9f414722250e51f92c322fd39d9dc48c62a7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_monotonic.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_names.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_names.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b2232db812ae860c61f686a1c62450d386cf480d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_names.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_partial_indexing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_partial_indexing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..063e3f64304c68867100cf5802c3a1d64bd70fb1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_partial_indexing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_pickle.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_pickle.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..00d68f0d9717b19cc59761254a345587522547fa Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_pickle.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_reindex.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_reindex.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1da6f708b5e3b5ce299ad27206d775b8904ddd1e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_reindex.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_reshape.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_reshape.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f45040970f53f385aba4c7eb74b405a41cf72764 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_reshape.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_setops.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_setops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4523c8d4c905ce0f5a47f441a52841a5db941900 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_setops.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_sorting.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_sorting.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..392d3b8b8c95073e6436b24d2bb454751e81f493 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_sorting.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_take.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_take.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..adb3d9386b350376a282016e2f5dda3789e51983 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/__pycache__/test_take.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/conftest.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..15062aee56e3a1b91d1f6eb76a4f86e381e0ad44 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/conftest.py @@ -0,0 +1,27 @@ +import numpy as np +import pytest + +from pandas import ( + Index, + MultiIndex, +) + + +# Note: identical the "multi" entry in the top-level "index" fixture +@pytest.fixture +def idx(): + # a MultiIndex used to test the general functionality of the + # general functionality of this object + major_axis = Index(["foo", "bar", "baz", "qux"]) + minor_axis = Index(["one", "two"]) + + major_codes = np.array([0, 0, 1, 2, 3, 3]) + minor_codes = np.array([0, 1, 0, 1, 0, 1]) + index_names = ["first", "second"] + mi = MultiIndex( + levels=[major_axis, minor_axis], + codes=[major_codes, minor_codes], + names=index_names, + verify_integrity=False, + ) + return mi diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_analytics.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_analytics.py new file mode 100644 index 0000000000000000000000000000000000000000..87f1439db5fc87c3be08e3675df1dae0fdb5554d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_analytics.py @@ -0,0 +1,263 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Index, + MultiIndex, + date_range, + period_range, +) +import pandas._testing as tm + + +def test_infer_objects(idx): + with pytest.raises(NotImplementedError, match="to_frame"): + idx.infer_objects() + + +def test_shift(idx): + # GH8083 test the base class for shift + msg = ( + "This method is only implemented for DatetimeIndex, PeriodIndex and " + "TimedeltaIndex; Got type MultiIndex" + ) + with pytest.raises(NotImplementedError, match=msg): + idx.shift(1) + with pytest.raises(NotImplementedError, match=msg): + idx.shift(1, 2) + + +def test_groupby(idx): + groups = idx.groupby(np.array([1, 1, 1, 2, 2, 2])) + labels = idx.tolist() + exp = {1: labels[:3], 2: labels[3:]} + tm.assert_dict_equal(groups, exp) + + # GH5620 + groups = idx.groupby(idx) + exp = {key: [key] for key in idx} + tm.assert_dict_equal(groups, exp) + + +def test_truncate_multiindex(): + # GH 34564 for MultiIndex level names check + major_axis = Index(list(range(4))) + minor_axis = Index(list(range(2))) + + major_codes = np.array([0, 0, 1, 2, 3, 3]) + minor_codes = np.array([0, 1, 0, 1, 0, 1]) + + index = MultiIndex( + levels=[major_axis, minor_axis], + codes=[major_codes, minor_codes], + names=["L1", "L2"], + ) + + result = index.truncate(before=1) + assert "foo" not in result.levels[0] + assert 1 in result.levels[0] + assert index.names == result.names + + result = index.truncate(after=1) + assert 2 not in result.levels[0] + assert 1 in result.levels[0] + assert index.names == result.names + + result = index.truncate(before=1, after=2) + assert len(result.levels[0]) == 2 + assert index.names == result.names + + msg = "after < before" + with pytest.raises(ValueError, match=msg): + index.truncate(3, 1) + + +# TODO: reshape + + +def test_reorder_levels(idx): + # this blows up + with pytest.raises(IndexError, match="^Too many levels"): + idx.reorder_levels([2, 1, 0]) + + +def test_numpy_repeat(): + reps = 2 + numbers = [1, 2, 3] + names = np.array(["foo", "bar"]) + + m = MultiIndex.from_product([numbers, names], names=names) + expected = MultiIndex.from_product([numbers, names.repeat(reps)], names=names) + tm.assert_index_equal(np.repeat(m, reps), expected) + + msg = "the 'axis' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.repeat(m, reps, axis=1) + + +def test_append_mixed_dtypes(): + # GH 13660 + dti = date_range("2011-01-01", freq="ME", periods=3) + dti_tz = date_range("2011-01-01", freq="ME", periods=3, tz="US/Eastern") + pi = period_range("2011-01", freq="M", periods=3) + + mi = MultiIndex.from_arrays( + [[1, 2, 3], [1.1, np.nan, 3.3], ["a", "b", "c"], dti, dti_tz, pi] + ) + assert mi.nlevels == 6 + + res = mi.append(mi) + exp = MultiIndex.from_arrays( + [ + [1, 2, 3, 1, 2, 3], + [1.1, np.nan, 3.3, 1.1, np.nan, 3.3], + ["a", "b", "c", "a", "b", "c"], + dti.append(dti), + dti_tz.append(dti_tz), + pi.append(pi), + ] + ) + tm.assert_index_equal(res, exp) + + other = MultiIndex.from_arrays( + [ + ["x", "y", "z"], + ["x", "y", "z"], + ["x", "y", "z"], + ["x", "y", "z"], + ["x", "y", "z"], + ["x", "y", "z"], + ] + ) + + res = mi.append(other) + exp = MultiIndex.from_arrays( + [ + [1, 2, 3, "x", "y", "z"], + [1.1, np.nan, 3.3, "x", "y", "z"], + ["a", "b", "c", "x", "y", "z"], + dti.append(Index(["x", "y", "z"])), + dti_tz.append(Index(["x", "y", "z"])), + pi.append(Index(["x", "y", "z"])), + ] + ) + tm.assert_index_equal(res, exp) + + +def test_iter(idx): + result = list(idx) + expected = [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ] + assert result == expected + + +def test_sub(idx): + first = idx + + # - now raises (previously was set op difference) + msg = "cannot perform __sub__ with this index type: MultiIndex" + with pytest.raises(TypeError, match=msg): + first - idx[-3:] + with pytest.raises(TypeError, match=msg): + idx[-3:] - first + with pytest.raises(TypeError, match=msg): + idx[-3:] - first.tolist() + msg = "cannot perform __rsub__ with this index type: MultiIndex" + with pytest.raises(TypeError, match=msg): + first.tolist() - idx[-3:] + + +def test_map(idx): + # callable + index = idx + + result = index.map(lambda x: x) + tm.assert_index_equal(result, index) + + +@pytest.mark.parametrize( + "mapper", + [ + lambda values, idx: {i: e for e, i in zip(values, idx)}, + lambda values, idx: pd.Series(values, idx), + ], +) +def test_map_dictlike(idx, mapper): + identity = mapper(idx.values, idx) + + # we don't infer to uint64 dtype for a dict + if idx.dtype == np.uint64 and isinstance(identity, dict): + expected = idx.astype("int64") + else: + expected = idx + + result = idx.map(identity) + tm.assert_index_equal(result, expected) + + # empty mappable + expected = Index([np.nan] * len(idx)) + result = idx.map(mapper(expected, idx)) + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize( + "func", + [ + np.exp, + np.exp2, + np.expm1, + np.log, + np.log2, + np.log10, + np.log1p, + np.sqrt, + np.sin, + np.cos, + np.tan, + np.arcsin, + np.arccos, + np.arctan, + np.sinh, + np.cosh, + np.tanh, + np.arcsinh, + np.arccosh, + np.arctanh, + np.deg2rad, + np.rad2deg, + ], + ids=lambda func: func.__name__, +) +def test_numpy_ufuncs(idx, func): + # test ufuncs of numpy. see: + # https://numpy.org/doc/stable/reference/ufuncs.html + + expected_exception = TypeError + msg = ( + "loop of ufunc does not support argument 0 of type tuple which " + f"has no callable {func.__name__} method" + ) + with pytest.raises(expected_exception, match=msg): + func(idx) + + +@pytest.mark.parametrize( + "func", + [np.isfinite, np.isinf, np.isnan, np.signbit], + ids=lambda func: func.__name__, +) +def test_numpy_type_funcs(idx, func): + msg = ( + f"ufunc '{func.__name__}' not supported for the input types, and the inputs " + "could not be safely coerced to any supported types according to " + "the casting rule ''safe''" + ) + with pytest.raises(TypeError, match=msg): + func(idx) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_astype.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_astype.py new file mode 100644 index 0000000000000000000000000000000000000000..29908537fbe590328ac586e05f90f3cc24cab9ab --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_astype.py @@ -0,0 +1,30 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas._testing as tm + + +def test_astype(idx): + expected = idx.copy() + actual = idx.astype("O") + tm.assert_copy(actual.levels, expected.levels) + tm.assert_copy(actual.codes, expected.codes) + assert actual.names == list(expected.names) + + with pytest.raises(TypeError, match="^Setting.*dtype.*object"): + idx.astype(np.dtype(int)) + + +@pytest.mark.parametrize("ordered", [True, False]) +def test_astype_category(idx, ordered): + # GH 18630 + msg = "> 1 ndim Categorical are not supported at this time" + with pytest.raises(NotImplementedError, match=msg): + idx.astype(CategoricalDtype(ordered=ordered)) + + if ordered is False: + # dtype='category' defaults to ordered=False, so only test once + with pytest.raises(NotImplementedError, match=msg): + idx.astype("category") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_compat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_compat.py new file mode 100644 index 0000000000000000000000000000000000000000..27a8c6e9b715880a57e711e8eab457ae553a4867 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_compat.py @@ -0,0 +1,122 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import MultiIndex +import pandas._testing as tm + + +def test_numeric_compat(idx): + with pytest.raises(TypeError, match="cannot perform __mul__"): + idx * 1 + + with pytest.raises(TypeError, match="cannot perform __rmul__"): + 1 * idx + + div_err = "cannot perform __truediv__" + with pytest.raises(TypeError, match=div_err): + idx / 1 + + div_err = div_err.replace(" __", " __r") + with pytest.raises(TypeError, match=div_err): + 1 / idx + + with pytest.raises(TypeError, match="cannot perform __floordiv__"): + idx // 1 + + with pytest.raises(TypeError, match="cannot perform __rfloordiv__"): + 1 // idx + + +@pytest.mark.parametrize("method", ["all", "any", "__invert__"]) +def test_logical_compat(idx, method): + msg = f"cannot perform {method}" + + with pytest.raises(TypeError, match=msg): + getattr(idx, method)() + + +def test_inplace_mutation_resets_values(): + levels = [["a", "b", "c"], [4]] + levels2 = [[1, 2, 3], ["a"]] + codes = [[0, 1, 0, 2, 2, 0], [0, 0, 0, 0, 0, 0]] + + mi1 = MultiIndex(levels=levels, codes=codes) + mi2 = MultiIndex(levels=levels2, codes=codes) + + # instantiating MultiIndex should not access/cache _.values + assert "_values" not in mi1._cache + assert "_values" not in mi2._cache + + vals = mi1.values.copy() + vals2 = mi2.values.copy() + + # accessing .values should cache ._values + assert mi1._values is mi1._cache["_values"] + assert mi1.values is mi1._cache["_values"] + assert isinstance(mi1._cache["_values"], np.ndarray) + + # Make sure level setting works + new_vals = mi1.set_levels(levels2).values + tm.assert_almost_equal(vals2, new_vals) + + # Doesn't drop _values from _cache [implementation detail] + tm.assert_almost_equal(mi1._cache["_values"], vals) + + # ...and values is still same too + tm.assert_almost_equal(mi1.values, vals) + + # Make sure label setting works too + codes2 = [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]] + exp_values = np.empty((6,), dtype=object) + exp_values[:] = [(1, "a")] * 6 + + # Must be 1d array of tuples + assert exp_values.shape == (6,) + + new_mi = mi2.set_codes(codes2) + assert "_values" not in new_mi._cache + new_values = new_mi.values + assert "_values" in new_mi._cache + + # Shouldn't change cache + tm.assert_almost_equal(mi2._cache["_values"], vals2) + + # Should have correct values + tm.assert_almost_equal(exp_values, new_values) + + +def test_boxable_categorical_values(): + cat = pd.Categorical(pd.date_range("2012-01-01", periods=3, freq="h")) + result = MultiIndex.from_product([["a", "b", "c"], cat]).values + expected = pd.Series( + [ + ("a", pd.Timestamp("2012-01-01 00:00:00")), + ("a", pd.Timestamp("2012-01-01 01:00:00")), + ("a", pd.Timestamp("2012-01-01 02:00:00")), + ("b", pd.Timestamp("2012-01-01 00:00:00")), + ("b", pd.Timestamp("2012-01-01 01:00:00")), + ("b", pd.Timestamp("2012-01-01 02:00:00")), + ("c", pd.Timestamp("2012-01-01 00:00:00")), + ("c", pd.Timestamp("2012-01-01 01:00:00")), + ("c", pd.Timestamp("2012-01-01 02:00:00")), + ] + ).values + tm.assert_numpy_array_equal(result, expected) + result = pd.DataFrame({"a": ["a", "b", "c"], "b": cat, "c": np.array(cat)}).values + expected = pd.DataFrame( + { + "a": ["a", "b", "c"], + "b": [ + pd.Timestamp("2012-01-01 00:00:00"), + pd.Timestamp("2012-01-01 01:00:00"), + pd.Timestamp("2012-01-01 02:00:00"), + ], + "c": [ + pd.Timestamp("2012-01-01 00:00:00"), + pd.Timestamp("2012-01-01 01:00:00"), + pd.Timestamp("2012-01-01 02:00:00"), + ], + } + ).values + tm.assert_numpy_array_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_constructors.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_constructors.py new file mode 100644 index 0000000000000000000000000000000000000000..8456e6a7acba5a160556f6f3192fa795ce678786 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_constructors.py @@ -0,0 +1,860 @@ +from datetime import ( + date, + datetime, +) +import itertools + +import numpy as np +import pytest + +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike + +import pandas as pd +from pandas import ( + Index, + MultiIndex, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm + + +def test_constructor_single_level(): + result = MultiIndex( + levels=[["foo", "bar", "baz", "qux"]], codes=[[0, 1, 2, 3]], names=["first"] + ) + assert isinstance(result, MultiIndex) + expected = Index(["foo", "bar", "baz", "qux"], name="first") + tm.assert_index_equal(result.levels[0], expected) + assert result.names == ["first"] + + +def test_constructor_no_levels(): + msg = "non-zero number of levels/codes" + with pytest.raises(ValueError, match=msg): + MultiIndex(levels=[], codes=[]) + + msg = "Must pass both levels and codes" + with pytest.raises(TypeError, match=msg): + MultiIndex(levels=[]) + with pytest.raises(TypeError, match=msg): + MultiIndex(codes=[]) + + +def test_constructor_nonhashable_names(): + # GH 20527 + levels = [[1, 2], ["one", "two"]] + codes = [[0, 0, 1, 1], [0, 1, 0, 1]] + names = (["foo"], ["bar"]) + msg = r"MultiIndex\.name must be a hashable type" + with pytest.raises(TypeError, match=msg): + MultiIndex(levels=levels, codes=codes, names=names) + + # With .rename() + mi = MultiIndex( + levels=[[1, 2], ["one", "two"]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=("foo", "bar"), + ) + renamed = [["fooo"], ["barr"]] + with pytest.raises(TypeError, match=msg): + mi.rename(names=renamed) + + # With .set_names() + with pytest.raises(TypeError, match=msg): + mi.set_names(names=renamed) + + +def test_constructor_mismatched_codes_levels(idx): + codes = [np.array([1]), np.array([2]), np.array([3])] + levels = ["a"] + + msg = "Length of levels and codes must be the same" + with pytest.raises(ValueError, match=msg): + MultiIndex(levels=levels, codes=codes) + + length_error = ( + r"On level 0, code max \(3\) >= length of level \(1\)\. " + "NOTE: this index is in an inconsistent state" + ) + label_error = r"Unequal code lengths: \[4, 2\]" + code_value_error = r"On level 0, code value \(-2\) < -1" + + # important to check that it's looking at the right thing. + with pytest.raises(ValueError, match=length_error): + MultiIndex(levels=[["a"], ["b"]], codes=[[0, 1, 2, 3], [0, 3, 4, 1]]) + + with pytest.raises(ValueError, match=label_error): + MultiIndex(levels=[["a"], ["b"]], codes=[[0, 0, 0, 0], [0, 0]]) + + # external API + with pytest.raises(ValueError, match=length_error): + idx.copy().set_levels([["a"], ["b"]]) + + with pytest.raises(ValueError, match=label_error): + idx.copy().set_codes([[0, 0, 0, 0], [0, 0]]) + + # test set_codes with verify_integrity=False + # the setting should not raise any value error + idx.copy().set_codes(codes=[[0, 0, 0, 0], [0, 0]], verify_integrity=False) + + # code value smaller than -1 + with pytest.raises(ValueError, match=code_value_error): + MultiIndex(levels=[["a"], ["b"]], codes=[[0, -2], [0, 0]]) + + +def test_na_levels(): + # GH26408 + # test if codes are re-assigned value -1 for levels + # with missing values (NaN, NaT, None) + result = MultiIndex( + levels=[[np.nan, None, pd.NaT, 128, 2]], codes=[[0, -1, 1, 2, 3, 4]] + ) + expected = MultiIndex( + levels=[[np.nan, None, pd.NaT, 128, 2]], codes=[[-1, -1, -1, -1, 3, 4]] + ) + tm.assert_index_equal(result, expected) + + result = MultiIndex( + levels=[[np.nan, "s", pd.NaT, 128, None]], codes=[[0, -1, 1, 2, 3, 4]] + ) + expected = MultiIndex( + levels=[[np.nan, "s", pd.NaT, 128, None]], codes=[[-1, -1, 1, -1, 3, -1]] + ) + tm.assert_index_equal(result, expected) + + # verify set_levels and set_codes + result = MultiIndex( + levels=[[1, 2, 3, 4, 5]], codes=[[0, -1, 1, 2, 3, 4]] + ).set_levels([[np.nan, "s", pd.NaT, 128, None]]) + tm.assert_index_equal(result, expected) + + result = MultiIndex( + levels=[[np.nan, "s", pd.NaT, 128, None]], codes=[[1, 2, 2, 2, 2, 2]] + ).set_codes([[0, -1, 1, 2, 3, 4]]) + tm.assert_index_equal(result, expected) + + +def test_copy_in_constructor(): + levels = np.array(["a", "b", "c"]) + codes = np.array([1, 1, 2, 0, 0, 1, 1]) + val = codes[0] + mi = MultiIndex(levels=[levels, levels], codes=[codes, codes], copy=True) + assert mi.codes[0][0] == val + codes[0] = 15 + assert mi.codes[0][0] == val + val = levels[0] + levels[0] = "PANDA" + assert mi.levels[0][0] == val + + +# ---------------------------------------------------------------------------- +# from_arrays +# ---------------------------------------------------------------------------- +def test_from_arrays(idx): + arrays = [ + np.asarray(lev).take(level_codes) + for lev, level_codes in zip(idx.levels, idx.codes) + ] + + # list of arrays as input + result = MultiIndex.from_arrays(arrays, names=idx.names) + tm.assert_index_equal(result, idx) + + # infer correctly + result = MultiIndex.from_arrays([[pd.NaT, Timestamp("20130101")], ["a", "b"]]) + assert result.levels[0].equals(Index([Timestamp("20130101")])) + assert result.levels[1].equals(Index(["a", "b"])) + + +def test_from_arrays_iterator(idx): + # GH 18434 + arrays = [ + np.asarray(lev).take(level_codes) + for lev, level_codes in zip(idx.levels, idx.codes) + ] + + # iterator as input + result = MultiIndex.from_arrays(iter(arrays), names=idx.names) + tm.assert_index_equal(result, idx) + + # invalid iterator input + msg = "Input must be a list / sequence of array-likes." + with pytest.raises(TypeError, match=msg): + MultiIndex.from_arrays(0) + + +def test_from_arrays_tuples(idx): + arrays = tuple( + tuple(np.asarray(lev).take(level_codes)) + for lev, level_codes in zip(idx.levels, idx.codes) + ) + + # tuple of tuples as input + result = MultiIndex.from_arrays(arrays, names=idx.names) + tm.assert_index_equal(result, idx) + + +@pytest.mark.parametrize( + ("idx1", "idx2"), + [ + ( + pd.period_range("2011-01-01", freq="D", periods=3), + pd.period_range("2015-01-01", freq="h", periods=3), + ), + ( + date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern"), + date_range("2015-01-01 10:00", freq="h", periods=3, tz="Asia/Tokyo"), + ), + ( + pd.timedelta_range("1 days", freq="D", periods=3), + pd.timedelta_range("2 hours", freq="h", periods=3), + ), + ], +) +def test_from_arrays_index_series_period_datetimetz_and_timedelta(idx1, idx2): + result = MultiIndex.from_arrays([idx1, idx2]) + tm.assert_index_equal(result.get_level_values(0), idx1) + tm.assert_index_equal(result.get_level_values(1), idx2) + + result2 = MultiIndex.from_arrays([Series(idx1), Series(idx2)]) + tm.assert_index_equal(result2.get_level_values(0), idx1) + tm.assert_index_equal(result2.get_level_values(1), idx2) + + tm.assert_index_equal(result, result2) + + +def test_from_arrays_index_datetimelike_mixed(): + idx1 = date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern") + idx2 = date_range("2015-01-01 10:00", freq="h", periods=3) + idx3 = pd.timedelta_range("1 days", freq="D", periods=3) + idx4 = pd.period_range("2011-01-01", freq="D", periods=3) + + result = MultiIndex.from_arrays([idx1, idx2, idx3, idx4]) + tm.assert_index_equal(result.get_level_values(0), idx1) + tm.assert_index_equal(result.get_level_values(1), idx2) + tm.assert_index_equal(result.get_level_values(2), idx3) + tm.assert_index_equal(result.get_level_values(3), idx4) + + result2 = MultiIndex.from_arrays( + [Series(idx1), Series(idx2), Series(idx3), Series(idx4)] + ) + tm.assert_index_equal(result2.get_level_values(0), idx1) + tm.assert_index_equal(result2.get_level_values(1), idx2) + tm.assert_index_equal(result2.get_level_values(2), idx3) + tm.assert_index_equal(result2.get_level_values(3), idx4) + + tm.assert_index_equal(result, result2) + + +def test_from_arrays_index_series_categorical(): + # GH13743 + idx1 = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), ordered=False) + idx2 = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), ordered=True) + + result = MultiIndex.from_arrays([idx1, idx2]) + tm.assert_index_equal(result.get_level_values(0), idx1) + tm.assert_index_equal(result.get_level_values(1), idx2) + + result2 = MultiIndex.from_arrays([Series(idx1), Series(idx2)]) + tm.assert_index_equal(result2.get_level_values(0), idx1) + tm.assert_index_equal(result2.get_level_values(1), idx2) + + result3 = MultiIndex.from_arrays([idx1.values, idx2.values]) + tm.assert_index_equal(result3.get_level_values(0), idx1) + tm.assert_index_equal(result3.get_level_values(1), idx2) + + +def test_from_arrays_empty(): + # 0 levels + msg = "Must pass non-zero number of levels/codes" + with pytest.raises(ValueError, match=msg): + MultiIndex.from_arrays(arrays=[]) + + # 1 level + result = MultiIndex.from_arrays(arrays=[[]], names=["A"]) + assert isinstance(result, MultiIndex) + expected = Index([], name="A") + tm.assert_index_equal(result.levels[0], expected) + assert result.names == ["A"] + + # N levels + for N in [2, 3]: + arrays = [[]] * N + names = list("ABC")[:N] + result = MultiIndex.from_arrays(arrays=arrays, names=names) + expected = MultiIndex(levels=[[]] * N, codes=[[]] * N, names=names) + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize( + "invalid_sequence_of_arrays", + [ + 1, + [1], + [1, 2], + [[1], 2], + [1, [2]], + "a", + ["a"], + ["a", "b"], + [["a"], "b"], + (1,), + (1, 2), + ([1], 2), + (1, [2]), + "a", + ("a",), + ("a", "b"), + (["a"], "b"), + [(1,), 2], + [1, (2,)], + [("a",), "b"], + ((1,), 2), + (1, (2,)), + (("a",), "b"), + ], +) +def test_from_arrays_invalid_input(invalid_sequence_of_arrays): + msg = "Input must be a list / sequence of array-likes" + with pytest.raises(TypeError, match=msg): + MultiIndex.from_arrays(arrays=invalid_sequence_of_arrays) + + +@pytest.mark.parametrize( + "idx1, idx2", [([1, 2, 3], ["a", "b"]), ([], ["a", "b"]), ([1, 2, 3], [])] +) +def test_from_arrays_different_lengths(idx1, idx2): + # see gh-13599 + msg = "^all arrays must be same length$" + with pytest.raises(ValueError, match=msg): + MultiIndex.from_arrays([idx1, idx2]) + + +def test_from_arrays_respects_none_names(): + # GH27292 + a = Series([1, 2, 3], name="foo") + b = Series(["a", "b", "c"], name="bar") + + result = MultiIndex.from_arrays([a, b], names=None) + expected = MultiIndex( + levels=[[1, 2, 3], ["a", "b", "c"]], codes=[[0, 1, 2], [0, 1, 2]], names=None + ) + + tm.assert_index_equal(result, expected) + + +# ---------------------------------------------------------------------------- +# from_tuples +# ---------------------------------------------------------------------------- +def test_from_tuples(): + msg = "Cannot infer number of levels from empty list" + with pytest.raises(TypeError, match=msg): + MultiIndex.from_tuples([]) + + expected = MultiIndex( + levels=[[1, 3], [2, 4]], codes=[[0, 1], [0, 1]], names=["a", "b"] + ) + + # input tuples + result = MultiIndex.from_tuples(((1, 2), (3, 4)), names=["a", "b"]) + tm.assert_index_equal(result, expected) + + +def test_from_tuples_iterator(): + # GH 18434 + # input iterator for tuples + expected = MultiIndex( + levels=[[1, 3], [2, 4]], codes=[[0, 1], [0, 1]], names=["a", "b"] + ) + + result = MultiIndex.from_tuples(zip([1, 3], [2, 4]), names=["a", "b"]) + tm.assert_index_equal(result, expected) + + # input non-iterables + msg = "Input must be a list / sequence of tuple-likes." + with pytest.raises(TypeError, match=msg): + MultiIndex.from_tuples(0) + + +def test_from_tuples_empty(): + # GH 16777 + result = MultiIndex.from_tuples([], names=["a", "b"]) + expected = MultiIndex.from_arrays(arrays=[[], []], names=["a", "b"]) + tm.assert_index_equal(result, expected) + + +def test_from_tuples_index_values(idx): + result = MultiIndex.from_tuples(idx) + assert (result.values == idx.values).all() + + +def test_tuples_with_name_string(): + # GH 15110 and GH 14848 + + li = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] + msg = "Names should be list-like for a MultiIndex" + with pytest.raises(ValueError, match=msg): + Index(li, name="abc") + with pytest.raises(ValueError, match=msg): + Index(li, name="a") + + +def test_from_tuples_with_tuple_label(): + # GH 15457 + expected = pd.DataFrame( + [[2, 1, 2], [4, (1, 2), 3]], columns=["a", "b", "c"] + ).set_index(["a", "b"]) + idx = MultiIndex.from_tuples([(2, 1), (4, (1, 2))], names=("a", "b")) + result = pd.DataFrame([2, 3], columns=["c"], index=idx) + tm.assert_frame_equal(expected, result) + + +# ---------------------------------------------------------------------------- +# from_product +# ---------------------------------------------------------------------------- +def test_from_product_empty_zero_levels(): + # 0 levels + msg = "Must pass non-zero number of levels/codes" + with pytest.raises(ValueError, match=msg): + MultiIndex.from_product([]) + + +def test_from_product_empty_one_level(): + result = MultiIndex.from_product([[]], names=["A"]) + expected = Index([], name="A") + tm.assert_index_equal(result.levels[0], expected) + assert result.names == ["A"] + + +@pytest.mark.parametrize( + "first, second", [([], []), (["foo", "bar", "baz"], []), ([], ["a", "b", "c"])] +) +def test_from_product_empty_two_levels(first, second): + names = ["A", "B"] + result = MultiIndex.from_product([first, second], names=names) + expected = MultiIndex(levels=[first, second], codes=[[], []], names=names) + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("N", list(range(4))) +def test_from_product_empty_three_levels(N): + # GH12258 + names = ["A", "B", "C"] + lvl2 = list(range(N)) + result = MultiIndex.from_product([[], lvl2, []], names=names) + expected = MultiIndex(levels=[[], lvl2, []], codes=[[], [], []], names=names) + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize( + "invalid_input", [1, [1], [1, 2], [[1], 2], "a", ["a"], ["a", "b"], [["a"], "b"]] +) +def test_from_product_invalid_input(invalid_input): + msg = r"Input must be a list / sequence of iterables|Input must be list-like" + with pytest.raises(TypeError, match=msg): + MultiIndex.from_product(iterables=invalid_input) + + +def test_from_product_datetimeindex(): + dt_index = date_range("2000-01-01", periods=2) + mi = MultiIndex.from_product([[1, 2], dt_index]) + etalon = construct_1d_object_array_from_listlike( + [ + (1, Timestamp("2000-01-01")), + (1, Timestamp("2000-01-02")), + (2, Timestamp("2000-01-01")), + (2, Timestamp("2000-01-02")), + ] + ) + tm.assert_numpy_array_equal(mi.values, etalon) + + +def test_from_product_rangeindex(): + # RangeIndex is preserved by factorize, so preserved in levels + rng = Index(range(5)) + other = ["a", "b"] + mi = MultiIndex.from_product([rng, other]) + tm.assert_index_equal(mi._levels[0], rng, exact=True) + + +@pytest.mark.parametrize("ordered", [False, True]) +@pytest.mark.parametrize("f", [lambda x: x, lambda x: Series(x), lambda x: x.values]) +def test_from_product_index_series_categorical(ordered, f): + # GH13743 + first = ["foo", "bar"] + + idx = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), ordered=ordered) + expected = pd.CategoricalIndex( + list("abcaab") + list("abcaab"), categories=list("bac"), ordered=ordered + ) + + result = MultiIndex.from_product([first, f(idx)]) + tm.assert_index_equal(result.get_level_values(1), expected) + + +def test_from_product(): + first = ["foo", "bar", "buz"] + second = ["a", "b", "c"] + names = ["first", "second"] + result = MultiIndex.from_product([first, second], names=names) + + tuples = [ + ("foo", "a"), + ("foo", "b"), + ("foo", "c"), + ("bar", "a"), + ("bar", "b"), + ("bar", "c"), + ("buz", "a"), + ("buz", "b"), + ("buz", "c"), + ] + expected = MultiIndex.from_tuples(tuples, names=names) + + tm.assert_index_equal(result, expected) + + +def test_from_product_iterator(): + # GH 18434 + first = ["foo", "bar", "buz"] + second = ["a", "b", "c"] + names = ["first", "second"] + tuples = [ + ("foo", "a"), + ("foo", "b"), + ("foo", "c"), + ("bar", "a"), + ("bar", "b"), + ("bar", "c"), + ("buz", "a"), + ("buz", "b"), + ("buz", "c"), + ] + expected = MultiIndex.from_tuples(tuples, names=names) + + # iterator as input + result = MultiIndex.from_product(iter([first, second]), names=names) + tm.assert_index_equal(result, expected) + + # Invalid non-iterable input + msg = "Input must be a list / sequence of iterables." + with pytest.raises(TypeError, match=msg): + MultiIndex.from_product(0) + + +@pytest.mark.parametrize( + "a, b, expected_names", + [ + ( + Series([1, 2, 3], name="foo"), + Series(["a", "b"], name="bar"), + ["foo", "bar"], + ), + (Series([1, 2, 3], name="foo"), ["a", "b"], ["foo", None]), + ([1, 2, 3], ["a", "b"], None), + ], +) +def test_from_product_infer_names(a, b, expected_names): + # GH27292 + result = MultiIndex.from_product([a, b]) + expected = MultiIndex( + levels=[[1, 2, 3], ["a", "b"]], + codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], + names=expected_names, + ) + tm.assert_index_equal(result, expected) + + +def test_from_product_respects_none_names(): + # GH27292 + a = Series([1, 2, 3], name="foo") + b = Series(["a", "b"], name="bar") + + result = MultiIndex.from_product([a, b], names=None) + expected = MultiIndex( + levels=[[1, 2, 3], ["a", "b"]], + codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], + names=None, + ) + tm.assert_index_equal(result, expected) + + +def test_from_product_readonly(): + # GH#15286 passing read-only array to from_product + a = np.array(range(3)) + b = ["a", "b"] + expected = MultiIndex.from_product([a, b]) + + a.setflags(write=False) + result = MultiIndex.from_product([a, b]) + tm.assert_index_equal(result, expected) + + +def test_create_index_existing_name(idx): + # GH11193, when an existing index is passed, and a new name is not + # specified, the new index should inherit the previous object name + index = idx + index.names = ["foo", "bar"] + result = Index(index) + expected = Index( + Index( + [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ], + dtype="object", + ) + ) + tm.assert_index_equal(result, expected) + + result = Index(index, name="A") + expected = Index( + Index( + [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ], + dtype="object", + ), + name="A", + ) + tm.assert_index_equal(result, expected) + + +# ---------------------------------------------------------------------------- +# from_frame +# ---------------------------------------------------------------------------- +def test_from_frame(): + # GH 22420 + df = pd.DataFrame( + [["a", "a"], ["a", "b"], ["b", "a"], ["b", "b"]], columns=["L1", "L2"] + ) + expected = MultiIndex.from_tuples( + [("a", "a"), ("a", "b"), ("b", "a"), ("b", "b")], names=["L1", "L2"] + ) + result = MultiIndex.from_frame(df) + tm.assert_index_equal(expected, result) + + +def test_from_frame_missing_values_multiIndex(): + # GH 39984 + pa = pytest.importorskip("pyarrow") + + df = pd.DataFrame( + { + "a": Series([1, 2, None], dtype="Int64"), + "b": pd.Float64Dtype().__from_arrow__(pa.array([0.2, np.nan, None])), + } + ) + multi_indexed = MultiIndex.from_frame(df) + expected = MultiIndex.from_arrays( + [ + Series([1, 2, None]).astype("Int64"), + pd.Float64Dtype().__from_arrow__(pa.array([0.2, np.nan, None])), + ], + names=["a", "b"], + ) + tm.assert_index_equal(multi_indexed, expected) + + +@pytest.mark.parametrize( + "non_frame", + [ + Series([1, 2, 3, 4]), + [1, 2, 3, 4], + [[1, 2], [3, 4], [5, 6]], + Index([1, 2, 3, 4]), + np.array([[1, 2], [3, 4], [5, 6]]), + 27, + ], +) +def test_from_frame_error(non_frame): + # GH 22420 + with pytest.raises(TypeError, match="Input must be a DataFrame"): + MultiIndex.from_frame(non_frame) + + +def test_from_frame_dtype_fidelity(): + # GH 22420 + df = pd.DataFrame( + { + "dates": date_range("19910905", periods=6, tz="US/Eastern"), + "a": [1, 1, 1, 2, 2, 2], + "b": pd.Categorical(["a", "a", "b", "b", "c", "c"], ordered=True), + "c": ["x", "x", "y", "z", "x", "y"], + } + ) + original_dtypes = df.dtypes.to_dict() + + expected_mi = MultiIndex.from_arrays( + [ + date_range("19910905", periods=6, tz="US/Eastern"), + [1, 1, 1, 2, 2, 2], + pd.Categorical(["a", "a", "b", "b", "c", "c"], ordered=True), + ["x", "x", "y", "z", "x", "y"], + ], + names=["dates", "a", "b", "c"], + ) + mi = MultiIndex.from_frame(df) + mi_dtypes = {name: mi.levels[i].dtype for i, name in enumerate(mi.names)} + + tm.assert_index_equal(expected_mi, mi) + assert original_dtypes == mi_dtypes + + +@pytest.mark.parametrize( + "names_in,names_out", [(None, [("L1", "x"), ("L2", "y")]), (["x", "y"], ["x", "y"])] +) +def test_from_frame_valid_names(names_in, names_out): + # GH 22420 + df = pd.DataFrame( + [["a", "a"], ["a", "b"], ["b", "a"], ["b", "b"]], + columns=MultiIndex.from_tuples([("L1", "x"), ("L2", "y")]), + ) + mi = MultiIndex.from_frame(df, names=names_in) + assert mi.names == names_out + + +@pytest.mark.parametrize( + "names,expected_error_msg", + [ + ("bad_input", "Names should be list-like for a MultiIndex"), + (["a", "b", "c"], "Length of names must match number of levels in MultiIndex"), + ], +) +def test_from_frame_invalid_names(names, expected_error_msg): + # GH 22420 + df = pd.DataFrame( + [["a", "a"], ["a", "b"], ["b", "a"], ["b", "b"]], + columns=MultiIndex.from_tuples([("L1", "x"), ("L2", "y")]), + ) + with pytest.raises(ValueError, match=expected_error_msg): + MultiIndex.from_frame(df, names=names) + + +def test_index_equal_empty_iterable(): + # #16844 + a = MultiIndex(levels=[[], []], codes=[[], []], names=["a", "b"]) + b = MultiIndex.from_arrays(arrays=[[], []], names=["a", "b"]) + tm.assert_index_equal(a, b) + + +def test_raise_invalid_sortorder(): + # Test that the MultiIndex constructor raise when a incorrect sortorder is given + # GH#28518 + + levels = [[0, 1], [0, 1, 2]] + + # Correct sortorder + MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], sortorder=2 + ) + + with pytest.raises(ValueError, match=r".* sortorder 2 with lexsort_depth 1.*"): + MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], sortorder=2 + ) + + with pytest.raises(ValueError, match=r".* sortorder 1 with lexsort_depth 0.*"): + MultiIndex( + levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], sortorder=1 + ) + + +def test_datetimeindex(): + idx1 = pd.DatetimeIndex( + ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"] * 2, tz="Asia/Tokyo" + ) + idx2 = date_range("2010/01/01", periods=6, freq="ME", tz="US/Eastern") + idx = MultiIndex.from_arrays([idx1, idx2]) + + expected1 = pd.DatetimeIndex( + ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"], tz="Asia/Tokyo" + ) + + tm.assert_index_equal(idx.levels[0], expected1) + tm.assert_index_equal(idx.levels[1], idx2) + + # from datetime combos + # GH 7888 + date1 = np.datetime64("today") + date2 = datetime.today() + date3 = Timestamp.today() + + for d1, d2 in itertools.product([date1, date2, date3], [date1, date2, date3]): + index = MultiIndex.from_product([[d1], [d2]]) + assert isinstance(index.levels[0], pd.DatetimeIndex) + assert isinstance(index.levels[1], pd.DatetimeIndex) + + # but NOT date objects, matching Index behavior + date4 = date.today() + index = MultiIndex.from_product([[date4], [date2]]) + assert not isinstance(index.levels[0], pd.DatetimeIndex) + assert isinstance(index.levels[1], pd.DatetimeIndex) + + +def test_constructor_with_tz(): + index = pd.DatetimeIndex( + ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific" + ) + columns = pd.DatetimeIndex( + ["2014/01/01 09:00", "2014/01/02 09:00"], name="dt2", tz="Asia/Tokyo" + ) + + result = MultiIndex.from_arrays([index, columns]) + + assert result.names == ["dt1", "dt2"] + tm.assert_index_equal(result.levels[0], index) + tm.assert_index_equal(result.levels[1], columns) + + result = MultiIndex.from_arrays([Series(index), Series(columns)]) + + assert result.names == ["dt1", "dt2"] + tm.assert_index_equal(result.levels[0], index) + tm.assert_index_equal(result.levels[1], columns) + + +def test_multiindex_inference_consistency(): + # check that inference behavior matches the base class + + v = date.today() + + arr = [v, v] + + idx = Index(arr) + assert idx.dtype == object + + mi = MultiIndex.from_arrays([arr]) + lev = mi.levels[0] + assert lev.dtype == object + + mi = MultiIndex.from_product([arr]) + lev = mi.levels[0] + assert lev.dtype == object + + mi = MultiIndex.from_tuples([(x,) for x in arr]) + lev = mi.levels[0] + assert lev.dtype == object + + +def test_dtype_representation(using_infer_string): + # GH#46900 + pmidx = MultiIndex.from_arrays([[1], ["a"]], names=[("a", "b"), ("c", "d")]) + result = pmidx.dtypes + exp = "object" if not using_infer_string else "string" + expected = Series( + ["int64", exp], + index=MultiIndex.from_tuples([("a", "b"), ("c", "d")]), + dtype=object, + ) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_conversion.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_conversion.py new file mode 100644 index 0000000000000000000000000000000000000000..3c2ca045d6f990837fae4d2b3d7bcbbf40e175e9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_conversion.py @@ -0,0 +1,164 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + MultiIndex, +) +import pandas._testing as tm + + +def test_to_numpy(idx): + result = idx.to_numpy() + exp = idx.values + tm.assert_numpy_array_equal(result, exp) + + +def test_to_frame(): + tuples = [(1, "one"), (1, "two"), (2, "one"), (2, "two")] + + index = MultiIndex.from_tuples(tuples) + result = index.to_frame(index=False) + expected = DataFrame(tuples) + tm.assert_frame_equal(result, expected) + + result = index.to_frame() + expected.index = index + tm.assert_frame_equal(result, expected) + + tuples = [(1, "one"), (1, "two"), (2, "one"), (2, "two")] + index = MultiIndex.from_tuples(tuples, names=["first", "second"]) + result = index.to_frame(index=False) + expected = DataFrame(tuples) + expected.columns = ["first", "second"] + tm.assert_frame_equal(result, expected) + + result = index.to_frame() + expected.index = index + tm.assert_frame_equal(result, expected) + + # See GH-22580 + index = MultiIndex.from_tuples(tuples) + result = index.to_frame(index=False, name=["first", "second"]) + expected = DataFrame(tuples) + expected.columns = ["first", "second"] + tm.assert_frame_equal(result, expected) + + result = index.to_frame(name=["first", "second"]) + expected.index = index + expected.columns = ["first", "second"] + tm.assert_frame_equal(result, expected) + + msg = "'name' must be a list / sequence of column names." + with pytest.raises(TypeError, match=msg): + index.to_frame(name="first") + + msg = "'name' should have same length as number of levels on index." + with pytest.raises(ValueError, match=msg): + index.to_frame(name=["first"]) + + # Tests for datetime index + index = MultiIndex.from_product([range(5), pd.date_range("20130101", periods=3)]) + result = index.to_frame(index=False) + expected = DataFrame( + { + 0: np.repeat(np.arange(5, dtype="int64"), 3), + 1: np.tile(pd.date_range("20130101", periods=3), 5), + } + ) + tm.assert_frame_equal(result, expected) + + result = index.to_frame() + expected.index = index + tm.assert_frame_equal(result, expected) + + # See GH-22580 + result = index.to_frame(index=False, name=["first", "second"]) + expected = DataFrame( + { + "first": np.repeat(np.arange(5, dtype="int64"), 3), + "second": np.tile(pd.date_range("20130101", periods=3), 5), + } + ) + tm.assert_frame_equal(result, expected) + + result = index.to_frame(name=["first", "second"]) + expected.index = index + tm.assert_frame_equal(result, expected) + + +def test_to_frame_dtype_fidelity(): + # GH 22420 + mi = MultiIndex.from_arrays( + [ + pd.date_range("19910905", periods=6, tz="US/Eastern"), + [1, 1, 1, 2, 2, 2], + pd.Categorical(["a", "a", "b", "b", "c", "c"], ordered=True), + ["x", "x", "y", "z", "x", "y"], + ], + names=["dates", "a", "b", "c"], + ) + original_dtypes = {name: mi.levels[i].dtype for i, name in enumerate(mi.names)} + + expected_df = DataFrame( + { + "dates": pd.date_range("19910905", periods=6, tz="US/Eastern"), + "a": [1, 1, 1, 2, 2, 2], + "b": pd.Categorical(["a", "a", "b", "b", "c", "c"], ordered=True), + "c": ["x", "x", "y", "z", "x", "y"], + } + ) + df = mi.to_frame(index=False) + df_dtypes = df.dtypes.to_dict() + + tm.assert_frame_equal(df, expected_df) + assert original_dtypes == df_dtypes + + +def test_to_frame_resulting_column_order(): + # GH 22420 + expected = ["z", 0, "a"] + mi = MultiIndex.from_arrays( + [["a", "b", "c"], ["x", "y", "z"], ["q", "w", "e"]], names=expected + ) + result = mi.to_frame().columns.tolist() + assert result == expected + + +def test_to_frame_duplicate_labels(): + # GH 45245 + data = [(1, 2), (3, 4)] + names = ["a", "a"] + index = MultiIndex.from_tuples(data, names=names) + with pytest.raises(ValueError, match="Cannot create duplicate column labels"): + index.to_frame() + + result = index.to_frame(allow_duplicates=True) + expected = DataFrame(data, index=index, columns=names) + tm.assert_frame_equal(result, expected) + + names = [None, 0] + index = MultiIndex.from_tuples(data, names=names) + with pytest.raises(ValueError, match="Cannot create duplicate column labels"): + index.to_frame() + + result = index.to_frame(allow_duplicates=True) + expected = DataFrame(data, index=index, columns=[0, 0]) + tm.assert_frame_equal(result, expected) + + +def test_to_flat_index(idx): + expected = pd.Index( + ( + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ), + tupleize_cols=False, + ) + result = idx.to_flat_index() + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_copy.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_copy.py new file mode 100644 index 0000000000000000000000000000000000000000..2e09a580f9528bc8197d55c6a7533098e0129fa2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_copy.py @@ -0,0 +1,96 @@ +from copy import ( + copy, + deepcopy, +) + +import pytest + +from pandas import MultiIndex +import pandas._testing as tm + + +def assert_multiindex_copied(copy, original): + # Levels should be (at least, shallow copied) + tm.assert_copy(copy.levels, original.levels) + tm.assert_almost_equal(copy.codes, original.codes) + + # Labels doesn't matter which way copied + tm.assert_almost_equal(copy.codes, original.codes) + assert copy.codes is not original.codes + + # Names doesn't matter which way copied + assert copy.names == original.names + assert copy.names is not original.names + + # Sort order should be copied + assert copy.sortorder == original.sortorder + + +def test_copy(idx): + i_copy = idx.copy() + + assert_multiindex_copied(i_copy, idx) + + +def test_shallow_copy(idx): + i_copy = idx._view() + + assert_multiindex_copied(i_copy, idx) + + +def test_view(idx): + i_view = idx.view() + assert_multiindex_copied(i_view, idx) + + +@pytest.mark.parametrize("func", [copy, deepcopy]) +def test_copy_and_deepcopy(func): + idx = MultiIndex( + levels=[["foo", "bar"], ["fizz", "buzz"]], + codes=[[0, 0, 0, 1], [0, 0, 1, 1]], + names=["first", "second"], + ) + idx_copy = func(idx) + assert idx_copy is not idx + assert idx_copy.equals(idx) + + +@pytest.mark.parametrize("deep", [True, False]) +def test_copy_method(deep): + idx = MultiIndex( + levels=[["foo", "bar"], ["fizz", "buzz"]], + codes=[[0, 0, 0, 1], [0, 0, 1, 1]], + names=["first", "second"], + ) + idx_copy = idx.copy(deep=deep) + assert idx_copy.equals(idx) + + +@pytest.mark.parametrize("deep", [True, False]) +@pytest.mark.parametrize( + "kwarg, value", + [ + ("names", ["third", "fourth"]), + ], +) +def test_copy_method_kwargs(deep, kwarg, value): + # gh-12309: Check that the "name" argument as well other kwargs are honored + idx = MultiIndex( + levels=[["foo", "bar"], ["fizz", "buzz"]], + codes=[[0, 0, 0, 1], [0, 0, 1, 1]], + names=["first", "second"], + ) + idx_copy = idx.copy(**{kwarg: value, "deep": deep}) + assert getattr(idx_copy, kwarg) == value + + +def test_copy_deep_false_retains_id(): + # GH#47878 + idx = MultiIndex( + levels=[["foo", "bar"], ["fizz", "buzz"]], + codes=[[0, 0, 0, 1], [0, 0, 1, 1]], + names=["first", "second"], + ) + + res = idx.copy(deep=False) + assert res._id is idx._id diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_drop.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_drop.py new file mode 100644 index 0000000000000000000000000000000000000000..99c8ebb1e57b22059d5a545a79de7b8348d73b14 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_drop.py @@ -0,0 +1,190 @@ +import numpy as np +import pytest + +from pandas.errors import PerformanceWarning + +import pandas as pd +from pandas import ( + Index, + MultiIndex, +) +import pandas._testing as tm + + +def test_drop(idx): + dropped = idx.drop([("foo", "two"), ("qux", "one")]) + + index = MultiIndex.from_tuples([("foo", "two"), ("qux", "one")]) + dropped2 = idx.drop(index) + + expected = idx[[0, 2, 3, 5]] + tm.assert_index_equal(dropped, expected) + tm.assert_index_equal(dropped2, expected) + + dropped = idx.drop(["bar"]) + expected = idx[[0, 1, 3, 4, 5]] + tm.assert_index_equal(dropped, expected) + + dropped = idx.drop("foo") + expected = idx[[2, 3, 4, 5]] + tm.assert_index_equal(dropped, expected) + + index = MultiIndex.from_tuples([("bar", "two")]) + with pytest.raises(KeyError, match=r"^\('bar', 'two'\)$"): + idx.drop([("bar", "two")]) + with pytest.raises(KeyError, match=r"^\('bar', 'two'\)$"): + idx.drop(index) + with pytest.raises(KeyError, match=r"^'two'$"): + idx.drop(["foo", "two"]) + + # partially correct argument + mixed_index = MultiIndex.from_tuples([("qux", "one"), ("bar", "two")]) + with pytest.raises(KeyError, match=r"^\('bar', 'two'\)$"): + idx.drop(mixed_index) + + # error='ignore' + dropped = idx.drop(index, errors="ignore") + expected = idx[[0, 1, 2, 3, 4, 5]] + tm.assert_index_equal(dropped, expected) + + dropped = idx.drop(mixed_index, errors="ignore") + expected = idx[[0, 1, 2, 3, 5]] + tm.assert_index_equal(dropped, expected) + + dropped = idx.drop(["foo", "two"], errors="ignore") + expected = idx[[2, 3, 4, 5]] + tm.assert_index_equal(dropped, expected) + + # mixed partial / full drop + dropped = idx.drop(["foo", ("qux", "one")]) + expected = idx[[2, 3, 5]] + tm.assert_index_equal(dropped, expected) + + # mixed partial / full drop / error='ignore' + mixed_index = ["foo", ("qux", "one"), "two"] + with pytest.raises(KeyError, match=r"^'two'$"): + idx.drop(mixed_index) + dropped = idx.drop(mixed_index, errors="ignore") + expected = idx[[2, 3, 5]] + tm.assert_index_equal(dropped, expected) + + +def test_droplevel_with_names(idx): + index = idx[idx.get_loc("foo")] + dropped = index.droplevel(0) + assert dropped.name == "second" + + index = MultiIndex( + levels=[Index(range(4)), Index(range(4)), Index(range(4))], + codes=[ + np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0]), + ], + names=["one", "two", "three"], + ) + dropped = index.droplevel(0) + assert dropped.names == ("two", "three") + + dropped = index.droplevel("two") + expected = index.droplevel(1) + assert dropped.equals(expected) + + +def test_droplevel_list(): + index = MultiIndex( + levels=[Index(range(4)), Index(range(4)), Index(range(4))], + codes=[ + np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0]), + ], + names=["one", "two", "three"], + ) + + dropped = index[:2].droplevel(["three", "one"]) + expected = index[:2].droplevel(2).droplevel(0) + assert dropped.equals(expected) + + dropped = index[:2].droplevel([]) + expected = index[:2] + assert dropped.equals(expected) + + msg = ( + "Cannot remove 3 levels from an index with 3 levels: " + "at least one level must be left" + ) + with pytest.raises(ValueError, match=msg): + index[:2].droplevel(["one", "two", "three"]) + + with pytest.raises(KeyError, match="'Level four not found'"): + index[:2].droplevel(["one", "four"]) + + +def test_drop_not_lexsorted(): + # GH 12078 + + # define the lexsorted version of the multi-index + tuples = [("a", ""), ("b1", "c1"), ("b2", "c2")] + lexsorted_mi = MultiIndex.from_tuples(tuples, names=["b", "c"]) + assert lexsorted_mi._is_lexsorted() + + # and the not-lexsorted version + df = pd.DataFrame( + columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]] + ) + df = df.pivot_table(index="a", columns=["b", "c"], values="d") + df = df.reset_index() + not_lexsorted_mi = df.columns + assert not not_lexsorted_mi._is_lexsorted() + + # compare the results + tm.assert_index_equal(lexsorted_mi, not_lexsorted_mi) + with tm.assert_produces_warning(PerformanceWarning): + tm.assert_index_equal(lexsorted_mi.drop("a"), not_lexsorted_mi.drop("a")) + + +def test_drop_with_nan_in_index(nulls_fixture): + # GH#18853 + mi = MultiIndex.from_tuples([("blah", nulls_fixture)], names=["name", "date"]) + msg = r"labels \[Timestamp\('2001-01-01 00:00:00'\)\] not found in level" + with pytest.raises(KeyError, match=msg): + mi.drop(pd.Timestamp("2001"), level="date") + + +@pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning") +def test_drop_with_non_monotonic_duplicates(): + # GH#33494 + mi = MultiIndex.from_tuples([(1, 2), (2, 3), (1, 2)]) + result = mi.drop((1, 2)) + expected = MultiIndex.from_tuples([(2, 3)]) + tm.assert_index_equal(result, expected) + + +def test_single_level_drop_partially_missing_elements(): + # GH 37820 + + mi = MultiIndex.from_tuples([(1, 2), (2, 2), (3, 2)]) + msg = r"labels \[4\] not found in level" + with pytest.raises(KeyError, match=msg): + mi.drop(4, level=0) + with pytest.raises(KeyError, match=msg): + mi.drop([1, 4], level=0) + msg = r"labels \[nan\] not found in level" + with pytest.raises(KeyError, match=msg): + mi.drop([np.nan], level=0) + with pytest.raises(KeyError, match=msg): + mi.drop([np.nan, 1, 2, 3], level=0) + + mi = MultiIndex.from_tuples([(np.nan, 1), (1, 2)]) + msg = r"labels \['a'\] not found in level" + with pytest.raises(KeyError, match=msg): + mi.drop([np.nan, 1, "a"], level=0) + + +def test_droplevel_multiindex_one_level(): + # GH#37208 + index = MultiIndex.from_tuples([(2,)], names=("b",)) + result = index.droplevel([]) + expected = Index([2], name="b") + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_duplicates.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_duplicates.py new file mode 100644 index 0000000000000000000000000000000000000000..6c6d9022b1af31e905b5ec739753af77a52f438b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_duplicates.py @@ -0,0 +1,363 @@ +from itertools import product + +import numpy as np +import pytest + +from pandas._libs import ( + hashtable, + index as libindex, +) + +from pandas import ( + NA, + DatetimeIndex, + Index, + MultiIndex, + Series, +) +import pandas._testing as tm + + +@pytest.fixture +def idx_dup(): + # compare tests/indexes/multi/conftest.py + major_axis = Index(["foo", "bar", "baz", "qux"]) + minor_axis = Index(["one", "two"]) + + major_codes = np.array([0, 0, 1, 0, 1, 1]) + minor_codes = np.array([0, 1, 0, 1, 0, 1]) + index_names = ["first", "second"] + mi = MultiIndex( + levels=[major_axis, minor_axis], + codes=[major_codes, minor_codes], + names=index_names, + verify_integrity=False, + ) + return mi + + +@pytest.mark.parametrize("names", [None, ["first", "second"]]) +def test_unique(names): + mi = MultiIndex.from_arrays([[1, 2, 1, 2], [1, 1, 1, 2]], names=names) + + res = mi.unique() + exp = MultiIndex.from_arrays([[1, 2, 2], [1, 1, 2]], names=mi.names) + tm.assert_index_equal(res, exp) + + mi = MultiIndex.from_arrays([list("aaaa"), list("abab")], names=names) + res = mi.unique() + exp = MultiIndex.from_arrays([list("aa"), list("ab")], names=mi.names) + tm.assert_index_equal(res, exp) + + mi = MultiIndex.from_arrays([list("aaaa"), list("aaaa")], names=names) + res = mi.unique() + exp = MultiIndex.from_arrays([["a"], ["a"]], names=mi.names) + tm.assert_index_equal(res, exp) + + # GH #20568 - empty MI + mi = MultiIndex.from_arrays([[], []], names=names) + res = mi.unique() + tm.assert_index_equal(mi, res) + + +def test_unique_datetimelike(): + idx1 = DatetimeIndex( + ["2015-01-01", "2015-01-01", "2015-01-01", "2015-01-01", "NaT", "NaT"] + ) + idx2 = DatetimeIndex( + ["2015-01-01", "2015-01-01", "2015-01-02", "2015-01-02", "NaT", "2015-01-01"], + tz="Asia/Tokyo", + ) + result = MultiIndex.from_arrays([idx1, idx2]).unique() + + eidx1 = DatetimeIndex(["2015-01-01", "2015-01-01", "NaT", "NaT"]) + eidx2 = DatetimeIndex( + ["2015-01-01", "2015-01-02", "NaT", "2015-01-01"], tz="Asia/Tokyo" + ) + exp = MultiIndex.from_arrays([eidx1, eidx2]) + tm.assert_index_equal(result, exp) + + +@pytest.mark.parametrize("level", [0, "first", 1, "second"]) +def test_unique_level(idx, level): + # GH #17896 - with level= argument + result = idx.unique(level=level) + expected = idx.get_level_values(level).unique() + tm.assert_index_equal(result, expected) + + # With already unique level + mi = MultiIndex.from_arrays([[1, 3, 2, 4], [1, 3, 2, 5]], names=["first", "second"]) + result = mi.unique(level=level) + expected = mi.get_level_values(level) + tm.assert_index_equal(result, expected) + + # With empty MI + mi = MultiIndex.from_arrays([[], []], names=["first", "second"]) + result = mi.unique(level=level) + expected = mi.get_level_values(level) + tm.assert_index_equal(result, expected) + + +def test_duplicate_multiindex_codes(): + # GH 17464 + # Make sure that a MultiIndex with duplicate levels throws a ValueError + msg = r"Level values must be unique: \[[A', ]+\] on level 0" + with pytest.raises(ValueError, match=msg): + mi = MultiIndex([["A"] * 10, range(10)], [[0] * 10, range(10)]) + + # And that using set_levels with duplicate levels fails + mi = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]]) + msg = r"Level values must be unique: \[[AB', ]+\] on level 0" + with pytest.raises(ValueError, match=msg): + mi.set_levels([["A", "B", "A", "A", "B"], [2, 1, 3, -2, 5]]) + + +@pytest.mark.parametrize("names", [["a", "b", "a"], [1, 1, 2], [1, "a", 1]]) +def test_duplicate_level_names(names): + # GH18872, GH19029 + mi = MultiIndex.from_product([[0, 1]] * 3, names=names) + assert mi.names == names + + # With .rename() + mi = MultiIndex.from_product([[0, 1]] * 3) + mi = mi.rename(names) + assert mi.names == names + + # With .rename(., level=) + mi.rename(names[1], level=1, inplace=True) + mi = mi.rename([names[0], names[2]], level=[0, 2]) + assert mi.names == names + + +def test_duplicate_meta_data(): + # GH 10115 + mi = MultiIndex( + levels=[[0, 1], [0, 1, 2]], codes=[[0, 0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 0, 1, 2]] + ) + + for idx in [ + mi, + mi.set_names([None, None]), + mi.set_names([None, "Num"]), + mi.set_names(["Upper", "Num"]), + ]: + assert idx.has_duplicates + assert idx.drop_duplicates().names == idx.names + + +def test_has_duplicates(idx, idx_dup): + # see fixtures + assert idx.is_unique is True + assert idx.has_duplicates is False + assert idx_dup.is_unique is False + assert idx_dup.has_duplicates is True + + mi = MultiIndex( + levels=[[0, 1], [0, 1, 2]], codes=[[0, 0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 0, 1, 2]] + ) + assert mi.is_unique is False + assert mi.has_duplicates is True + + # single instance of NaN + mi_nan = MultiIndex( + levels=[["a", "b"], [0, 1]], codes=[[-1, 0, 0, 1, 1], [-1, 0, 1, 0, 1]] + ) + assert mi_nan.is_unique is True + assert mi_nan.has_duplicates is False + + # multiple instances of NaN + mi_nan_dup = MultiIndex( + levels=[["a", "b"], [0, 1]], codes=[[-1, -1, 0, 0, 1, 1], [-1, -1, 0, 1, 0, 1]] + ) + assert mi_nan_dup.is_unique is False + assert mi_nan_dup.has_duplicates is True + + +def test_has_duplicates_from_tuples(): + # GH 9075 + t = [ + ("x", "out", "z", 5, "y", "in", "z", 169), + ("x", "out", "z", 7, "y", "in", "z", 119), + ("x", "out", "z", 9, "y", "in", "z", 135), + ("x", "out", "z", 13, "y", "in", "z", 145), + ("x", "out", "z", 14, "y", "in", "z", 158), + ("x", "out", "z", 16, "y", "in", "z", 122), + ("x", "out", "z", 17, "y", "in", "z", 160), + ("x", "out", "z", 18, "y", "in", "z", 180), + ("x", "out", "z", 20, "y", "in", "z", 143), + ("x", "out", "z", 21, "y", "in", "z", 128), + ("x", "out", "z", 22, "y", "in", "z", 129), + ("x", "out", "z", 25, "y", "in", "z", 111), + ("x", "out", "z", 28, "y", "in", "z", 114), + ("x", "out", "z", 29, "y", "in", "z", 121), + ("x", "out", "z", 31, "y", "in", "z", 126), + ("x", "out", "z", 32, "y", "in", "z", 155), + ("x", "out", "z", 33, "y", "in", "z", 123), + ("x", "out", "z", 12, "y", "in", "z", 144), + ] + + mi = MultiIndex.from_tuples(t) + assert not mi.has_duplicates + + +@pytest.mark.parametrize("nlevels", [4, 8]) +@pytest.mark.parametrize("with_nulls", [True, False]) +def test_has_duplicates_overflow(nlevels, with_nulls): + # handle int64 overflow if possible + # no overflow with 4 + # overflow possible with 8 + codes = np.tile(np.arange(500), 2) + level = np.arange(500) + + if with_nulls: # inject some null values + codes[500] = -1 # common nan value + codes = [codes.copy() for i in range(nlevels)] + for i in range(nlevels): + codes[i][500 + i - nlevels // 2] = -1 + + codes += [np.array([-1, 1]).repeat(500)] + else: + codes = [codes] * nlevels + [np.arange(2).repeat(500)] + + levels = [level] * nlevels + [[0, 1]] + + # no dups + mi = MultiIndex(levels=levels, codes=codes) + assert not mi.has_duplicates + + # with a dup + if with_nulls: + + def f(a): + return np.insert(a, 1000, a[0]) + + codes = list(map(f, codes)) + mi = MultiIndex(levels=levels, codes=codes) + else: + values = mi.values.tolist() + mi = MultiIndex.from_tuples(values + [values[0]]) + + assert mi.has_duplicates + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", np.array([False, False, False, True, True, False])), + ("last", np.array([False, True, True, False, False, False])), + (False, np.array([False, True, True, True, True, False])), + ], +) +def test_duplicated(idx_dup, keep, expected): + result = idx_dup.duplicated(keep=keep) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.arm_slow +def test_duplicated_hashtable_impl(keep, monkeypatch): + # GH 9125 + n, k = 6, 10 + levels = [np.arange(n), [str(i) for i in range(n)], 1000 + np.arange(n)] + codes = [np.random.default_rng(2).choice(n, k * n) for _ in levels] + with monkeypatch.context() as m: + m.setattr(libindex, "_SIZE_CUTOFF", 50) + mi = MultiIndex(levels=levels, codes=codes) + + result = mi.duplicated(keep=keep) + expected = hashtable.duplicated(mi.values, keep=keep) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("val", [101, 102]) +def test_duplicated_with_nan(val): + # GH5873 + mi = MultiIndex.from_arrays([[101, val], [3.5, np.nan]]) + assert not mi.has_duplicates + + tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(2, dtype="bool")) + + +@pytest.mark.parametrize("n", range(1, 6)) +@pytest.mark.parametrize("m", range(1, 5)) +def test_duplicated_with_nan_multi_shape(n, m): + # GH5873 + # all possible unique combinations, including nan + codes = product(range(-1, n), range(-1, m)) + mi = MultiIndex( + levels=[list("abcde")[:n], list("WXYZ")[:m]], + codes=np.random.default_rng(2).permutation(list(codes)).T, + ) + assert len(mi) == (n + 1) * (m + 1) + assert not mi.has_duplicates + + tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(len(mi), dtype="bool")) + + +def test_duplicated_drop_duplicates(): + # GH#4060 + idx = MultiIndex.from_arrays(([1, 2, 3, 1, 2, 3], [1, 1, 1, 1, 2, 2])) + + expected = np.array([False, False, False, True, False, False], dtype=bool) + duplicated = idx.duplicated() + tm.assert_numpy_array_equal(duplicated, expected) + assert duplicated.dtype == bool + expected = MultiIndex.from_arrays(([1, 2, 3, 2, 3], [1, 1, 1, 2, 2])) + tm.assert_index_equal(idx.drop_duplicates(), expected) + + expected = np.array([True, False, False, False, False, False]) + duplicated = idx.duplicated(keep="last") + tm.assert_numpy_array_equal(duplicated, expected) + assert duplicated.dtype == bool + expected = MultiIndex.from_arrays(([2, 3, 1, 2, 3], [1, 1, 1, 2, 2])) + tm.assert_index_equal(idx.drop_duplicates(keep="last"), expected) + + expected = np.array([True, False, False, True, False, False]) + duplicated = idx.duplicated(keep=False) + tm.assert_numpy_array_equal(duplicated, expected) + assert duplicated.dtype == bool + expected = MultiIndex.from_arrays(([2, 3, 2, 3], [1, 1, 2, 2])) + tm.assert_index_equal(idx.drop_duplicates(keep=False), expected) + + +@pytest.mark.parametrize( + "dtype", + [ + np.complex64, + np.complex128, + ], +) +def test_duplicated_series_complex_numbers(dtype): + # GH 17927 + expected = Series( + [False, False, False, True, False, False, False, True, False, True], + dtype=bool, + ) + result = Series( + [ + np.nan + np.nan * 1j, + 0, + 1j, + 1j, + 1, + 1 + 1j, + 1 + 2j, + 1 + 1j, + np.nan, + np.nan + np.nan * 1j, + ], + dtype=dtype, + ).duplicated() + tm.assert_series_equal(result, expected) + + +def test_midx_unique_ea_dtype(): + # GH#48335 + vals_a = Series([1, 2, NA, NA], dtype="Int64") + vals_b = np.array([1, 2, 3, 3]) + midx = MultiIndex.from_arrays([vals_a, vals_b], names=["a", "b"]) + result = midx.unique() + + exp_vals_a = Series([1, 2, NA], dtype="Int64") + exp_vals_b = np.array([1, 2, 3]) + expected = MultiIndex.from_arrays([exp_vals_a, exp_vals_b], names=["a", "b"]) + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_equivalence.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_equivalence.py new file mode 100644 index 0000000000000000000000000000000000000000..9babbd5b8d56d64d704978758efb81f8d730274f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_equivalence.py @@ -0,0 +1,284 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_any_real_numeric_dtype + +import pandas as pd +from pandas import ( + Index, + MultiIndex, + Series, +) +import pandas._testing as tm + + +def test_equals(idx): + assert idx.equals(idx) + assert idx.equals(idx.copy()) + assert idx.equals(idx.astype(object)) + assert idx.equals(idx.to_flat_index()) + assert idx.equals(idx.to_flat_index().astype("category")) + + assert not idx.equals(list(idx)) + assert not idx.equals(np.array(idx)) + + same_values = Index(idx, dtype=object) + assert idx.equals(same_values) + assert same_values.equals(idx) + + if idx.nlevels == 1: + # do not test MultiIndex + assert not idx.equals(Series(idx)) + + +def test_equals_op(idx): + # GH9947, GH10637 + index_a = idx + + n = len(index_a) + index_b = index_a[0:-1] + index_c = index_a[0:-1].append(index_a[-2:-1]) + index_d = index_a[0:1] + with pytest.raises(ValueError, match="Lengths must match"): + index_a == index_b + expected1 = np.array([True] * n) + expected2 = np.array([True] * (n - 1) + [False]) + tm.assert_numpy_array_equal(index_a == index_a, expected1) + tm.assert_numpy_array_equal(index_a == index_c, expected2) + + # test comparisons with numpy arrays + array_a = np.array(index_a) + array_b = np.array(index_a[0:-1]) + array_c = np.array(index_a[0:-1].append(index_a[-2:-1])) + array_d = np.array(index_a[0:1]) + with pytest.raises(ValueError, match="Lengths must match"): + index_a == array_b + tm.assert_numpy_array_equal(index_a == array_a, expected1) + tm.assert_numpy_array_equal(index_a == array_c, expected2) + + # test comparisons with Series + series_a = Series(array_a) + series_b = Series(array_b) + series_c = Series(array_c) + series_d = Series(array_d) + with pytest.raises(ValueError, match="Lengths must match"): + index_a == series_b + + tm.assert_numpy_array_equal(index_a == series_a, expected1) + tm.assert_numpy_array_equal(index_a == series_c, expected2) + + # cases where length is 1 for one of them + with pytest.raises(ValueError, match="Lengths must match"): + index_a == index_d + with pytest.raises(ValueError, match="Lengths must match"): + index_a == series_d + with pytest.raises(ValueError, match="Lengths must match"): + index_a == array_d + msg = "Can only compare identically-labeled Series objects" + with pytest.raises(ValueError, match=msg): + series_a == series_d + with pytest.raises(ValueError, match="Lengths must match"): + series_a == array_d + + # comparing with a scalar should broadcast; note that we are excluding + # MultiIndex because in this case each item in the index is a tuple of + # length 2, and therefore is considered an array of length 2 in the + # comparison instead of a scalar + if not isinstance(index_a, MultiIndex): + expected3 = np.array([False] * (len(index_a) - 2) + [True, False]) + # assuming the 2nd to last item is unique in the data + item = index_a[-2] + tm.assert_numpy_array_equal(index_a == item, expected3) + tm.assert_series_equal(series_a == item, Series(expected3)) + + +def test_compare_tuple(): + # GH#21517 + mi = MultiIndex.from_product([[1, 2]] * 2) + + all_false = np.array([False, False, False, False]) + + result = mi == mi[0] + expected = np.array([True, False, False, False]) + tm.assert_numpy_array_equal(result, expected) + + result = mi != mi[0] + tm.assert_numpy_array_equal(result, ~expected) + + result = mi < mi[0] + tm.assert_numpy_array_equal(result, all_false) + + result = mi <= mi[0] + tm.assert_numpy_array_equal(result, expected) + + result = mi > mi[0] + tm.assert_numpy_array_equal(result, ~expected) + + result = mi >= mi[0] + tm.assert_numpy_array_equal(result, ~all_false) + + +def test_compare_tuple_strs(): + # GH#34180 + + mi = MultiIndex.from_tuples([("a", "b"), ("b", "c"), ("c", "a")]) + + result = mi == ("c", "a") + expected = np.array([False, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = mi == ("c",) + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(result, expected) + + +def test_equals_multi(idx): + assert idx.equals(idx) + assert not idx.equals(idx.values) + assert idx.equals(Index(idx.values)) + + assert idx.equal_levels(idx) + assert not idx.equals(idx[:-1]) + assert not idx.equals(idx[-1]) + + # different number of levels + index = MultiIndex( + levels=[Index(list(range(4))), Index(list(range(4))), Index(list(range(4)))], + codes=[ + np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0]), + ], + ) + + index2 = MultiIndex(levels=index.levels[:-1], codes=index.codes[:-1]) + assert not index.equals(index2) + assert not index.equal_levels(index2) + + # levels are different + major_axis = Index(list(range(4))) + minor_axis = Index(list(range(2))) + + major_codes = np.array([0, 0, 1, 2, 2, 3]) + minor_codes = np.array([0, 1, 0, 0, 1, 0]) + + index = MultiIndex( + levels=[major_axis, minor_axis], codes=[major_codes, minor_codes] + ) + assert not idx.equals(index) + assert not idx.equal_levels(index) + + # some of the labels are different + major_axis = Index(["foo", "bar", "baz", "qux"]) + minor_axis = Index(["one", "two"]) + + major_codes = np.array([0, 0, 2, 2, 3, 3]) + minor_codes = np.array([0, 1, 0, 1, 0, 1]) + + index = MultiIndex( + levels=[major_axis, minor_axis], codes=[major_codes, minor_codes] + ) + assert not idx.equals(index) + + +def test_identical(idx): + mi = idx.copy() + mi2 = idx.copy() + assert mi.identical(mi2) + + mi = mi.set_names(["new1", "new2"]) + assert mi.equals(mi2) + assert not mi.identical(mi2) + + mi2 = mi2.set_names(["new1", "new2"]) + assert mi.identical(mi2) + + mi4 = Index(mi.tolist(), tupleize_cols=False) + assert not mi.identical(mi4) + assert mi.equals(mi4) + + +def test_equals_operator(idx): + # GH9785 + assert (idx == idx).all() + + +def test_equals_missing_values(): + # make sure take is not using -1 + i = MultiIndex.from_tuples([(0, pd.NaT), (0, pd.Timestamp("20130101"))]) + result = i[0:1].equals(i[0]) + assert not result + result = i[1:2].equals(i[1]) + assert not result + + +def test_equals_missing_values_differently_sorted(): + # GH#38439 + mi1 = MultiIndex.from_tuples([(81.0, np.nan), (np.nan, np.nan)]) + mi2 = MultiIndex.from_tuples([(np.nan, np.nan), (81.0, np.nan)]) + assert not mi1.equals(mi2) + + mi2 = MultiIndex.from_tuples([(81.0, np.nan), (np.nan, np.nan)]) + assert mi1.equals(mi2) + + +def test_is_(): + mi = MultiIndex.from_tuples(zip(range(10), range(10))) + assert mi.is_(mi) + assert mi.is_(mi.view()) + assert mi.is_(mi.view().view().view().view()) + mi2 = mi.view() + # names are metadata, they don't change id + mi2.names = ["A", "B"] + assert mi2.is_(mi) + assert mi.is_(mi2) + + assert not mi.is_(mi.set_names(["C", "D"])) + # levels are inherent properties, they change identity + mi3 = mi2.set_levels([list(range(10)), list(range(10))]) + assert not mi3.is_(mi2) + # shouldn't change + assert mi2.is_(mi) + mi4 = mi3.view() + + # GH 17464 - Remove duplicate MultiIndex levels + mi4 = mi4.set_levels([list(range(10)), list(range(10))]) + assert not mi4.is_(mi3) + mi5 = mi.view() + mi5 = mi5.set_levels(mi5.levels) + assert not mi5.is_(mi) + + +def test_is_all_dates(idx): + assert not idx._is_all_dates + + +def test_is_numeric(idx): + # MultiIndex is never numeric + assert not is_any_real_numeric_dtype(idx) + + +def test_multiindex_compare(): + # GH 21149 + # Ensure comparison operations for MultiIndex with nlevels == 1 + # behave consistently with those for MultiIndex with nlevels > 1 + + midx = MultiIndex.from_product([[0, 1]]) + + # Equality self-test: MultiIndex object vs self + expected = Series([True, True]) + result = Series(midx == midx) + tm.assert_series_equal(result, expected) + + # Greater than comparison: MultiIndex object vs self + expected = Series([False, False]) + result = Series(midx > midx) + tm.assert_series_equal(result, expected) + + +def test_equals_ea_int_regular_int(): + # GH#46026 + mi1 = MultiIndex.from_arrays([Index([1, 2], dtype="Int64"), [3, 4]]) + mi2 = MultiIndex.from_arrays([[1, 2], [3, 4]]) + assert not mi1.equals(mi2) + assert not mi2.equals(mi1) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_formats.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_formats.py new file mode 100644 index 0000000000000000000000000000000000000000..52ff3109128f24f43d9a12527d08770b463459a5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_formats.py @@ -0,0 +1,249 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Index, + MultiIndex, +) +import pandas._testing as tm + + +def test_format(idx): + msg = "MultiIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + idx.format() + idx[:0].format() + + +def test_format_integer_names(): + index = MultiIndex( + levels=[[0, 1], [0, 1]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=[0, 1] + ) + msg = "MultiIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + index.format(names=True) + + +def test_format_sparse_config(idx): + # GH1538 + msg = "MultiIndex.format is deprecated" + with pd.option_context("display.multi_sparse", False): + with tm.assert_produces_warning(FutureWarning, match=msg): + result = idx.format() + assert result[1] == "foo two" + + +def test_format_sparse_display(): + index = MultiIndex( + levels=[[0, 1], [0, 1], [0, 1], [0]], + codes=[ + [0, 0, 0, 1, 1, 1], + [0, 0, 1, 0, 0, 1], + [0, 1, 0, 0, 1, 0], + [0, 0, 0, 0, 0, 0], + ], + ) + msg = "MultiIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = index.format() + assert result[3] == "1 0 0 0" + + +def test_repr_with_unicode_data(): + with pd.option_context("display.encoding", "UTF-8"): + d = {"a": ["\u05d0", 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} + index = pd.DataFrame(d).set_index(["a", "b"]).index + assert "\\" not in repr(index) # we don't want unicode-escaped + + +def test_repr_roundtrip_raises(): + mi = MultiIndex.from_product([list("ab"), range(3)], names=["first", "second"]) + msg = "Must pass both levels and codes" + with pytest.raises(TypeError, match=msg): + eval(repr(mi)) + + +def test_unicode_string_with_unicode(): + d = {"a": ["\u05d0", 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} + idx = pd.DataFrame(d).set_index(["a", "b"]).index + str(idx) + + +def test_repr_max_seq_item_setting(idx): + # GH10182 + idx = idx.repeat(50) + with pd.option_context("display.max_seq_items", None): + repr(idx) + assert "..." not in str(idx) + + +class TestRepr: + def test_unicode_repr_issues(self): + levels = [Index(["a/\u03c3", "b/\u03c3", "c/\u03c3"]), Index([0, 1])] + codes = [np.arange(3).repeat(2), np.tile(np.arange(2), 3)] + index = MultiIndex(levels=levels, codes=codes) + + repr(index.levels) + repr(index.get_level_values(1)) + + def test_repr_max_seq_items_equal_to_n(self, idx): + # display.max_seq_items == n + with pd.option_context("display.max_seq_items", 6): + result = idx.__repr__() + expected = """\ +MultiIndex([('foo', 'one'), + ('foo', 'two'), + ('bar', 'one'), + ('baz', 'two'), + ('qux', 'one'), + ('qux', 'two')], + names=['first', 'second'])""" + assert result == expected + + def test_repr(self, idx): + result = idx[:1].__repr__() + expected = """\ +MultiIndex([('foo', 'one')], + names=['first', 'second'])""" + assert result == expected + + result = idx.__repr__() + expected = """\ +MultiIndex([('foo', 'one'), + ('foo', 'two'), + ('bar', 'one'), + ('baz', 'two'), + ('qux', 'one'), + ('qux', 'two')], + names=['first', 'second'])""" + assert result == expected + + with pd.option_context("display.max_seq_items", 5): + result = idx.__repr__() + expected = """\ +MultiIndex([('foo', 'one'), + ('foo', 'two'), + ... + ('qux', 'one'), + ('qux', 'two')], + names=['first', 'second'], length=6)""" + assert result == expected + + # display.max_seq_items == 1 + with pd.option_context("display.max_seq_items", 1): + result = idx.__repr__() + expected = """\ +MultiIndex([... + ('qux', 'two')], + names=['first', ...], length=6)""" + assert result == expected + + def test_rjust(self): + n = 1000 + ci = pd.CategoricalIndex(list("a" * n) + (["abc"] * n)) + dti = pd.date_range("2000-01-01", freq="s", periods=n * 2) + mi = MultiIndex.from_arrays([ci, ci.codes + 9, dti], names=["a", "b", "dti"]) + result = mi[:1].__repr__() + expected = """\ +MultiIndex([('a', 9, '2000-01-01 00:00:00')], + names=['a', 'b', 'dti'])""" + assert result == expected + + result = mi[::500].__repr__() + expected = """\ +MultiIndex([( 'a', 9, '2000-01-01 00:00:00'), + ( 'a', 9, '2000-01-01 00:08:20'), + ('abc', 10, '2000-01-01 00:16:40'), + ('abc', 10, '2000-01-01 00:25:00')], + names=['a', 'b', 'dti'])""" + assert result == expected + + result = mi.__repr__() + expected = """\ +MultiIndex([( 'a', 9, '2000-01-01 00:00:00'), + ( 'a', 9, '2000-01-01 00:00:01'), + ( 'a', 9, '2000-01-01 00:00:02'), + ( 'a', 9, '2000-01-01 00:00:03'), + ( 'a', 9, '2000-01-01 00:00:04'), + ( 'a', 9, '2000-01-01 00:00:05'), + ( 'a', 9, '2000-01-01 00:00:06'), + ( 'a', 9, '2000-01-01 00:00:07'), + ( 'a', 9, '2000-01-01 00:00:08'), + ( 'a', 9, '2000-01-01 00:00:09'), + ... + ('abc', 10, '2000-01-01 00:33:10'), + ('abc', 10, '2000-01-01 00:33:11'), + ('abc', 10, '2000-01-01 00:33:12'), + ('abc', 10, '2000-01-01 00:33:13'), + ('abc', 10, '2000-01-01 00:33:14'), + ('abc', 10, '2000-01-01 00:33:15'), + ('abc', 10, '2000-01-01 00:33:16'), + ('abc', 10, '2000-01-01 00:33:17'), + ('abc', 10, '2000-01-01 00:33:18'), + ('abc', 10, '2000-01-01 00:33:19')], + names=['a', 'b', 'dti'], length=2000)""" + assert result == expected + + def test_tuple_width(self): + n = 1000 + ci = pd.CategoricalIndex(list("a" * n) + (["abc"] * n)) + dti = pd.date_range("2000-01-01", freq="s", periods=n * 2) + levels = [ci, ci.codes + 9, dti, dti, dti] + names = ["a", "b", "dti_1", "dti_2", "dti_3"] + mi = MultiIndex.from_arrays(levels, names=names) + result = mi[:1].__repr__() + expected = """MultiIndex([('a', 9, '2000-01-01 00:00:00', '2000-01-01 00:00:00', ...)], + names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'])""" # noqa: E501 + assert result == expected + + result = mi[:10].__repr__() + expected = """\ +MultiIndex([('a', 9, '2000-01-01 00:00:00', '2000-01-01 00:00:00', ...), + ('a', 9, '2000-01-01 00:00:01', '2000-01-01 00:00:01', ...), + ('a', 9, '2000-01-01 00:00:02', '2000-01-01 00:00:02', ...), + ('a', 9, '2000-01-01 00:00:03', '2000-01-01 00:00:03', ...), + ('a', 9, '2000-01-01 00:00:04', '2000-01-01 00:00:04', ...), + ('a', 9, '2000-01-01 00:00:05', '2000-01-01 00:00:05', ...), + ('a', 9, '2000-01-01 00:00:06', '2000-01-01 00:00:06', ...), + ('a', 9, '2000-01-01 00:00:07', '2000-01-01 00:00:07', ...), + ('a', 9, '2000-01-01 00:00:08', '2000-01-01 00:00:08', ...), + ('a', 9, '2000-01-01 00:00:09', '2000-01-01 00:00:09', ...)], + names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'])""" + assert result == expected + + result = mi.__repr__() + expected = """\ +MultiIndex([( 'a', 9, '2000-01-01 00:00:00', '2000-01-01 00:00:00', ...), + ( 'a', 9, '2000-01-01 00:00:01', '2000-01-01 00:00:01', ...), + ( 'a', 9, '2000-01-01 00:00:02', '2000-01-01 00:00:02', ...), + ( 'a', 9, '2000-01-01 00:00:03', '2000-01-01 00:00:03', ...), + ( 'a', 9, '2000-01-01 00:00:04', '2000-01-01 00:00:04', ...), + ( 'a', 9, '2000-01-01 00:00:05', '2000-01-01 00:00:05', ...), + ( 'a', 9, '2000-01-01 00:00:06', '2000-01-01 00:00:06', ...), + ( 'a', 9, '2000-01-01 00:00:07', '2000-01-01 00:00:07', ...), + ( 'a', 9, '2000-01-01 00:00:08', '2000-01-01 00:00:08', ...), + ( 'a', 9, '2000-01-01 00:00:09', '2000-01-01 00:00:09', ...), + ... + ('abc', 10, '2000-01-01 00:33:10', '2000-01-01 00:33:10', ...), + ('abc', 10, '2000-01-01 00:33:11', '2000-01-01 00:33:11', ...), + ('abc', 10, '2000-01-01 00:33:12', '2000-01-01 00:33:12', ...), + ('abc', 10, '2000-01-01 00:33:13', '2000-01-01 00:33:13', ...), + ('abc', 10, '2000-01-01 00:33:14', '2000-01-01 00:33:14', ...), + ('abc', 10, '2000-01-01 00:33:15', '2000-01-01 00:33:15', ...), + ('abc', 10, '2000-01-01 00:33:16', '2000-01-01 00:33:16', ...), + ('abc', 10, '2000-01-01 00:33:17', '2000-01-01 00:33:17', ...), + ('abc', 10, '2000-01-01 00:33:18', '2000-01-01 00:33:18', ...), + ('abc', 10, '2000-01-01 00:33:19', '2000-01-01 00:33:19', ...)], + names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'], length=2000)""" + assert result == expected + + def test_multiindex_long_element(self): + # Non-regression test towards GH#52960 + data = MultiIndex.from_tuples([("c" * 62,)]) + + expected = ( + "MultiIndex([('cccccccccccccccccccccccccccccccccccccccc" + "cccccccccccccccccccccc',)],\n )" + ) + assert str(data) == expected diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_get_level_values.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_get_level_values.py new file mode 100644 index 0000000000000000000000000000000000000000..28c77e78924cbc35feed4ae838b81f6be38478b5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_get_level_values.py @@ -0,0 +1,124 @@ +import numpy as np + +import pandas as pd +from pandas import ( + CategoricalIndex, + Index, + MultiIndex, + Timestamp, + date_range, +) +import pandas._testing as tm + + +class TestGetLevelValues: + def test_get_level_values_box_datetime64(self): + dates = date_range("1/1/2000", periods=4) + levels = [dates, [0, 1]] + codes = [[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]] + + index = MultiIndex(levels=levels, codes=codes) + + assert isinstance(index.get_level_values(0)[0], Timestamp) + + +def test_get_level_values(idx): + result = idx.get_level_values(0) + expected = Index(["foo", "foo", "bar", "baz", "qux", "qux"], name="first") + tm.assert_index_equal(result, expected) + assert result.name == "first" + + result = idx.get_level_values("first") + expected = idx.get_level_values(0) + tm.assert_index_equal(result, expected) + + # GH 10460 + index = MultiIndex( + levels=[CategoricalIndex(["A", "B"]), CategoricalIndex([1, 2, 3])], + codes=[np.array([0, 0, 0, 1, 1, 1]), np.array([0, 1, 2, 0, 1, 2])], + ) + + exp = CategoricalIndex(["A", "A", "A", "B", "B", "B"]) + tm.assert_index_equal(index.get_level_values(0), exp) + exp = CategoricalIndex([1, 2, 3, 1, 2, 3]) + tm.assert_index_equal(index.get_level_values(1), exp) + + +def test_get_level_values_all_na(): + # GH#17924 when level entirely consists of nan + arrays = [[np.nan, np.nan, np.nan], ["a", np.nan, 1]] + index = MultiIndex.from_arrays(arrays) + result = index.get_level_values(0) + expected = Index([np.nan, np.nan, np.nan], dtype=np.float64) + tm.assert_index_equal(result, expected) + + result = index.get_level_values(1) + expected = Index(["a", np.nan, 1], dtype=object) + tm.assert_index_equal(result, expected) + + +def test_get_level_values_int_with_na(): + # GH#17924 + arrays = [["a", "b", "b"], [1, np.nan, 2]] + index = MultiIndex.from_arrays(arrays) + result = index.get_level_values(1) + expected = Index([1, np.nan, 2]) + tm.assert_index_equal(result, expected) + + arrays = [["a", "b", "b"], [np.nan, np.nan, 2]] + index = MultiIndex.from_arrays(arrays) + result = index.get_level_values(1) + expected = Index([np.nan, np.nan, 2]) + tm.assert_index_equal(result, expected) + + +def test_get_level_values_na(): + arrays = [[np.nan, np.nan, np.nan], ["a", np.nan, 1]] + index = MultiIndex.from_arrays(arrays) + result = index.get_level_values(0) + expected = Index([np.nan, np.nan, np.nan]) + tm.assert_index_equal(result, expected) + + result = index.get_level_values(1) + expected = Index(["a", np.nan, 1]) + tm.assert_index_equal(result, expected) + + arrays = [["a", "b", "b"], pd.DatetimeIndex([0, 1, pd.NaT])] + index = MultiIndex.from_arrays(arrays) + result = index.get_level_values(1) + expected = pd.DatetimeIndex([0, 1, pd.NaT]) + tm.assert_index_equal(result, expected) + + arrays = [[], []] + index = MultiIndex.from_arrays(arrays) + result = index.get_level_values(0) + expected = Index([], dtype=object) + tm.assert_index_equal(result, expected) + + +def test_get_level_values_when_periods(): + # GH33131. See also discussion in GH32669. + # This test can probably be removed when PeriodIndex._engine is removed. + from pandas import ( + Period, + PeriodIndex, + ) + + idx = MultiIndex.from_arrays( + [PeriodIndex([Period("2019Q1"), Period("2019Q2")], name="b")] + ) + idx2 = MultiIndex.from_arrays( + [idx._get_level_values(level) for level in range(idx.nlevels)] + ) + assert all(x.is_monotonic_increasing for x in idx2.levels) + + +def test_values_loses_freq_of_underlying_index(): + # GH#49054 + idx = pd.DatetimeIndex(date_range("20200101", periods=3, freq="BME")) + expected = idx.copy(deep=True) + idx2 = Index([1, 2, 3]) + midx = MultiIndex(levels=[idx, idx2], codes=[[0, 1, 2], [0, 1, 2]]) + midx.values + assert idx.freq is not None + tm.assert_index_equal(idx, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_get_set.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_get_set.py new file mode 100644 index 0000000000000000000000000000000000000000..6eeaeb6711d03c981fb190964de2ec5501380378 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_get_set.py @@ -0,0 +1,384 @@ +import numpy as np +import pytest + +from pandas.compat import PY311 + +from pandas.core.dtypes.dtypes import DatetimeTZDtype + +import pandas as pd +from pandas import ( + CategoricalIndex, + MultiIndex, +) +import pandas._testing as tm + + +def assert_matching(actual, expected, check_dtype=False): + # avoid specifying internal representation + # as much as possible + assert len(actual) == len(expected) + for act, exp in zip(actual, expected): + act = np.asarray(act) + exp = np.asarray(exp) + tm.assert_numpy_array_equal(act, exp, check_dtype=check_dtype) + + +def test_get_level_number_integer(idx): + idx.names = [1, 0] + assert idx._get_level_number(1) == 0 + assert idx._get_level_number(0) == 1 + msg = "Too many levels: Index has only 2 levels, not 3" + with pytest.raises(IndexError, match=msg): + idx._get_level_number(2) + with pytest.raises(KeyError, match="Level fourth not found"): + idx._get_level_number("fourth") + + +def test_get_dtypes(using_infer_string): + # Test MultiIndex.dtypes (# Gh37062) + idx_multitype = MultiIndex.from_product( + [[1, 2, 3], ["a", "b", "c"], pd.date_range("20200101", periods=2, tz="UTC")], + names=["int", "string", "dt"], + ) + + exp = "object" if not using_infer_string else "string" + expected = pd.Series( + { + "int": np.dtype("int64"), + "string": exp, + "dt": DatetimeTZDtype(tz="utc"), + } + ) + tm.assert_series_equal(expected, idx_multitype.dtypes) + + +def test_get_dtypes_no_level_name(using_infer_string): + # Test MultiIndex.dtypes (# GH38580 ) + idx_multitype = MultiIndex.from_product( + [ + [1, 2, 3], + ["a", "b", "c"], + pd.date_range("20200101", periods=2, tz="UTC"), + ], + ) + exp = "object" if not using_infer_string else "string" + expected = pd.Series( + { + "level_0": np.dtype("int64"), + "level_1": exp, + "level_2": DatetimeTZDtype(tz="utc"), + } + ) + tm.assert_series_equal(expected, idx_multitype.dtypes) + + +def test_get_dtypes_duplicate_level_names(using_infer_string): + # Test MultiIndex.dtypes with non-unique level names (# GH45174) + result = MultiIndex.from_product( + [ + [1, 2, 3], + ["a", "b", "c"], + pd.date_range("20200101", periods=2, tz="UTC"), + ], + names=["A", "A", "A"], + ).dtypes + exp = "object" if not using_infer_string else "string" + expected = pd.Series( + [np.dtype("int64"), exp, DatetimeTZDtype(tz="utc")], + index=["A", "A", "A"], + ) + tm.assert_series_equal(result, expected) + + +def test_get_level_number_out_of_bounds(multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + + with pytest.raises(IndexError, match="Too many levels"): + frame.index._get_level_number(2) + with pytest.raises(IndexError, match="not a valid level number"): + frame.index._get_level_number(-3) + + +def test_set_name_methods(idx): + # so long as these are synonyms, we don't need to test set_names + index_names = ["first", "second"] + assert idx.rename == idx.set_names + new_names = [name + "SUFFIX" for name in index_names] + ind = idx.set_names(new_names) + assert idx.names == index_names + assert ind.names == new_names + msg = "Length of names must match number of levels in MultiIndex" + with pytest.raises(ValueError, match=msg): + ind.set_names(new_names + new_names) + new_names2 = [name + "SUFFIX2" for name in new_names] + res = ind.set_names(new_names2, inplace=True) + assert res is None + assert ind.names == new_names2 + + # set names for specific level (# GH7792) + ind = idx.set_names(new_names[0], level=0) + assert idx.names == index_names + assert ind.names == [new_names[0], index_names[1]] + + res = ind.set_names(new_names2[0], level=0, inplace=True) + assert res is None + assert ind.names == [new_names2[0], index_names[1]] + + # set names for multiple levels + ind = idx.set_names(new_names, level=[0, 1]) + assert idx.names == index_names + assert ind.names == new_names + + res = ind.set_names(new_names2, level=[0, 1], inplace=True) + assert res is None + assert ind.names == new_names2 + + +def test_set_levels_codes_directly(idx): + # setting levels/codes directly raises AttributeError + + levels = idx.levels + new_levels = [[lev + "a" for lev in level] for level in levels] + + codes = idx.codes + major_codes, minor_codes = codes + major_codes = [(x + 1) % 3 for x in major_codes] + minor_codes = [(x + 1) % 1 for x in minor_codes] + new_codes = [major_codes, minor_codes] + + msg = "Can't set attribute" + with pytest.raises(AttributeError, match=msg): + idx.levels = new_levels + + msg = ( + "property 'codes' of 'MultiIndex' object has no setter" + if PY311 + else "can't set attribute" + ) + with pytest.raises(AttributeError, match=msg): + idx.codes = new_codes + + +def test_set_levels(idx): + # side note - you probably wouldn't want to use levels and codes + # directly like this - but it is possible. + levels = idx.levels + new_levels = [[lev + "a" for lev in level] for level in levels] + + # level changing [w/o mutation] + ind2 = idx.set_levels(new_levels) + assert_matching(ind2.levels, new_levels) + assert_matching(idx.levels, levels) + + # level changing specific level [w/o mutation] + ind2 = idx.set_levels(new_levels[0], level=0) + assert_matching(ind2.levels, [new_levels[0], levels[1]]) + assert_matching(idx.levels, levels) + + ind2 = idx.set_levels(new_levels[1], level=1) + assert_matching(ind2.levels, [levels[0], new_levels[1]]) + assert_matching(idx.levels, levels) + + # level changing multiple levels [w/o mutation] + ind2 = idx.set_levels(new_levels, level=[0, 1]) + assert_matching(ind2.levels, new_levels) + assert_matching(idx.levels, levels) + + # illegal level changing should not change levels + # GH 13754 + original_index = idx.copy() + with pytest.raises(ValueError, match="^On"): + idx.set_levels(["c"], level=0) + assert_matching(idx.levels, original_index.levels, check_dtype=True) + + with pytest.raises(ValueError, match="^On"): + idx.set_codes([0, 1, 2, 3, 4, 5], level=0) + assert_matching(idx.codes, original_index.codes, check_dtype=True) + + with pytest.raises(TypeError, match="^Levels"): + idx.set_levels("c", level=0) + assert_matching(idx.levels, original_index.levels, check_dtype=True) + + with pytest.raises(TypeError, match="^Codes"): + idx.set_codes(1, level=0) + assert_matching(idx.codes, original_index.codes, check_dtype=True) + + +def test_set_codes(idx): + # side note - you probably wouldn't want to use levels and codes + # directly like this - but it is possible. + codes = idx.codes + major_codes, minor_codes = codes + major_codes = [(x + 1) % 3 for x in major_codes] + minor_codes = [(x + 1) % 1 for x in minor_codes] + new_codes = [major_codes, minor_codes] + + # changing codes w/o mutation + ind2 = idx.set_codes(new_codes) + assert_matching(ind2.codes, new_codes) + assert_matching(idx.codes, codes) + + # codes changing specific level w/o mutation + ind2 = idx.set_codes(new_codes[0], level=0) + assert_matching(ind2.codes, [new_codes[0], codes[1]]) + assert_matching(idx.codes, codes) + + ind2 = idx.set_codes(new_codes[1], level=1) + assert_matching(ind2.codes, [codes[0], new_codes[1]]) + assert_matching(idx.codes, codes) + + # codes changing multiple levels w/o mutation + ind2 = idx.set_codes(new_codes, level=[0, 1]) + assert_matching(ind2.codes, new_codes) + assert_matching(idx.codes, codes) + + # label changing for levels of different magnitude of categories + ind = MultiIndex.from_tuples([(0, i) for i in range(130)]) + new_codes = range(129, -1, -1) + expected = MultiIndex.from_tuples([(0, i) for i in new_codes]) + + # [w/o mutation] + result = ind.set_codes(codes=new_codes, level=1) + assert result.equals(expected) + + +def test_set_levels_codes_names_bad_input(idx): + levels, codes = idx.levels, idx.codes + names = idx.names + + with pytest.raises(ValueError, match="Length of levels"): + idx.set_levels([levels[0]]) + + with pytest.raises(ValueError, match="Length of codes"): + idx.set_codes([codes[0]]) + + with pytest.raises(ValueError, match="Length of names"): + idx.set_names([names[0]]) + + # shouldn't scalar data error, instead should demand list-like + with pytest.raises(TypeError, match="list of lists-like"): + idx.set_levels(levels[0]) + + # shouldn't scalar data error, instead should demand list-like + with pytest.raises(TypeError, match="list of lists-like"): + idx.set_codes(codes[0]) + + # shouldn't scalar data error, instead should demand list-like + with pytest.raises(TypeError, match="list-like"): + idx.set_names(names[0]) + + # should have equal lengths + with pytest.raises(TypeError, match="list of lists-like"): + idx.set_levels(levels[0], level=[0, 1]) + + with pytest.raises(TypeError, match="list-like"): + idx.set_levels(levels, level=0) + + # should have equal lengths + with pytest.raises(TypeError, match="list of lists-like"): + idx.set_codes(codes[0], level=[0, 1]) + + with pytest.raises(TypeError, match="list-like"): + idx.set_codes(codes, level=0) + + # should have equal lengths + with pytest.raises(ValueError, match="Length of names"): + idx.set_names(names[0], level=[0, 1]) + + with pytest.raises(TypeError, match="Names must be a"): + idx.set_names(names, level=0) + + +@pytest.mark.parametrize("inplace", [True, False]) +def test_set_names_with_nlevel_1(inplace): + # GH 21149 + # Ensure that .set_names for MultiIndex with + # nlevels == 1 does not raise any errors + expected = MultiIndex(levels=[[0, 1]], codes=[[0, 1]], names=["first"]) + m = MultiIndex.from_product([[0, 1]]) + result = m.set_names("first", level=0, inplace=inplace) + + if inplace: + result = m + + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("ordered", [True, False]) +def test_set_levels_categorical(ordered): + # GH13854 + index = MultiIndex.from_arrays([list("xyzx"), [0, 1, 2, 3]]) + + cidx = CategoricalIndex(list("bac"), ordered=ordered) + result = index.set_levels(cidx, level=0) + expected = MultiIndex(levels=[cidx, [0, 1, 2, 3]], codes=index.codes) + tm.assert_index_equal(result, expected) + + result_lvl = result.get_level_values(0) + expected_lvl = CategoricalIndex( + list("bacb"), categories=cidx.categories, ordered=cidx.ordered + ) + tm.assert_index_equal(result_lvl, expected_lvl) + + +def test_set_value_keeps_names(): + # motivating example from #3742 + lev1 = ["hans", "hans", "hans", "grethe", "grethe", "grethe"] + lev2 = ["1", "2", "3"] * 2 + idx = MultiIndex.from_arrays([lev1, lev2], names=["Name", "Number"]) + df = pd.DataFrame( + np.random.default_rng(2).standard_normal((6, 4)), + columns=["one", "two", "three", "four"], + index=idx, + ) + df = df.sort_index() + assert df._is_copy is None + assert df.index.names == ("Name", "Number") + df.at[("grethe", "4"), "one"] = 99.34 + assert df._is_copy is None + assert df.index.names == ("Name", "Number") + + +def test_set_levels_with_iterable(): + # GH23273 + sizes = [1, 2, 3] + colors = ["black"] * 3 + index = MultiIndex.from_arrays([sizes, colors], names=["size", "color"]) + + result = index.set_levels(map(int, ["3", "2", "1"]), level="size") + + expected_sizes = [3, 2, 1] + expected = MultiIndex.from_arrays([expected_sizes, colors], names=["size", "color"]) + tm.assert_index_equal(result, expected) + + +def test_set_empty_level(): + # GH#48636 + midx = MultiIndex.from_arrays([[]], names=["A"]) + result = midx.set_levels(pd.DatetimeIndex([]), level=0) + expected = MultiIndex.from_arrays([pd.DatetimeIndex([])], names=["A"]) + tm.assert_index_equal(result, expected) + + +def test_set_levels_pos_args_removal(): + # https://github.com/pandas-dev/pandas/issues/41485 + idx = MultiIndex.from_tuples( + [ + (1, "one"), + (3, "one"), + ], + names=["foo", "bar"], + ) + with pytest.raises(TypeError, match="positional arguments"): + idx.set_levels(["a", "b", "c"], 0) + + with pytest.raises(TypeError, match="positional arguments"): + idx.set_codes([[0, 1], [1, 0]], 0) + + +def test_set_levels_categorical_keep_dtype(): + # GH#52125 + midx = MultiIndex.from_arrays([[5, 6]]) + result = midx.set_levels(levels=pd.Categorical([1, 2]), level=0) + expected = MultiIndex.from_arrays([pd.Categorical([1, 2])]) + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_indexing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_indexing.py new file mode 100644 index 0000000000000000000000000000000000000000..5e2d3c23da6452a4155af2674b7ce4a6dd7d2680 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_indexing.py @@ -0,0 +1,1001 @@ +from datetime import timedelta +import re + +import numpy as np +import pytest + +from pandas._libs import index as libindex +from pandas.errors import ( + InvalidIndexError, + PerformanceWarning, +) + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + Index, + MultiIndex, + date_range, +) +import pandas._testing as tm + + +class TestSliceLocs: + def test_slice_locs_partial(self, idx): + sorted_idx, _ = idx.sortlevel(0) + + result = sorted_idx.slice_locs(("foo", "two"), ("qux", "one")) + assert result == (1, 5) + + result = sorted_idx.slice_locs(None, ("qux", "one")) + assert result == (0, 5) + + result = sorted_idx.slice_locs(("foo", "two"), None) + assert result == (1, len(sorted_idx)) + + result = sorted_idx.slice_locs("bar", "baz") + assert result == (2, 4) + + def test_slice_locs(self): + df = DataFrame( + np.random.default_rng(2).standard_normal((50, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=50, freq="B"), + ) + stacked = df.stack(future_stack=True) + idx = stacked.index + + slob = slice(*idx.slice_locs(df.index[5], df.index[15])) + sliced = stacked[slob] + expected = df[5:16].stack(future_stack=True) + tm.assert_almost_equal(sliced.values, expected.values) + + slob = slice( + *idx.slice_locs( + df.index[5] + timedelta(seconds=30), + df.index[15] - timedelta(seconds=30), + ) + ) + sliced = stacked[slob] + expected = df[6:15].stack(future_stack=True) + tm.assert_almost_equal(sliced.values, expected.values) + + def test_slice_locs_with_type_mismatch(self): + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) + stacked = df.stack(future_stack=True) + idx = stacked.index + with pytest.raises(TypeError, match="^Level type mismatch"): + idx.slice_locs((1, 3)) + with pytest.raises(TypeError, match="^Level type mismatch"): + idx.slice_locs(df.index[5] + timedelta(seconds=30), (5, 2)) + df = DataFrame( + np.ones((5, 5)), + index=Index([f"i-{i}" for i in range(5)], name="a"), + columns=Index([f"i-{i}" for i in range(5)], name="a"), + ) + stacked = df.stack(future_stack=True) + idx = stacked.index + with pytest.raises(TypeError, match="^Level type mismatch"): + idx.slice_locs(timedelta(seconds=30)) + # TODO: Try creating a UnicodeDecodeError in exception message + with pytest.raises(TypeError, match="^Level type mismatch"): + idx.slice_locs(df.index[1], (16, "a")) + + def test_slice_locs_not_sorted(self): + index = MultiIndex( + levels=[Index(np.arange(4)), Index(np.arange(4)), Index(np.arange(4))], + codes=[ + np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0]), + ], + ) + msg = "[Kk]ey length.*greater than MultiIndex lexsort depth" + with pytest.raises(KeyError, match=msg): + index.slice_locs((1, 0, 1), (2, 1, 0)) + + # works + sorted_index, _ = index.sortlevel(0) + # should there be a test case here??? + sorted_index.slice_locs((1, 0, 1), (2, 1, 0)) + + def test_slice_locs_not_contained(self): + # some searchsorted action + + index = MultiIndex( + levels=[[0, 2, 4, 6], [0, 2, 4]], + codes=[[0, 0, 0, 1, 1, 2, 3, 3, 3], [0, 1, 2, 1, 2, 2, 0, 1, 2]], + ) + + result = index.slice_locs((1, 0), (5, 2)) + assert result == (3, 6) + + result = index.slice_locs(1, 5) + assert result == (3, 6) + + result = index.slice_locs((2, 2), (5, 2)) + assert result == (3, 6) + + result = index.slice_locs(2, 5) + assert result == (3, 6) + + result = index.slice_locs((1, 0), (6, 3)) + assert result == (3, 8) + + result = index.slice_locs(-1, 10) + assert result == (0, len(index)) + + @pytest.mark.parametrize( + "index_arr,expected,start_idx,end_idx", + [ + ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, None), + ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, "b"), + ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, ("b", "e")), + ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), None), + ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), "c"), + ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), ("c", "e")), + ], + ) + def test_slice_locs_with_missing_value( + self, index_arr, expected, start_idx, end_idx + ): + # issue 19132 + idx = MultiIndex.from_arrays(index_arr) + result = idx.slice_locs(start=start_idx, end=end_idx) + assert result == expected + + +class TestPutmask: + def test_putmask_with_wrong_mask(self, idx): + # GH18368 + + msg = "putmask: mask and data must be the same size" + with pytest.raises(ValueError, match=msg): + idx.putmask(np.ones(len(idx) + 1, np.bool_), 1) + + with pytest.raises(ValueError, match=msg): + idx.putmask(np.ones(len(idx) - 1, np.bool_), 1) + + with pytest.raises(ValueError, match=msg): + idx.putmask("foo", 1) + + def test_putmask_multiindex_other(self): + # GH#43212 `value` is also a MultiIndex + + left = MultiIndex.from_tuples([(np.nan, 6), (np.nan, 6), ("a", 4)]) + right = MultiIndex.from_tuples([("a", 1), ("a", 1), ("d", 1)]) + mask = np.array([True, True, False]) + + result = left.putmask(mask, right) + + expected = MultiIndex.from_tuples([right[0], right[1], left[2]]) + tm.assert_index_equal(result, expected) + + def test_putmask_keep_dtype(self, any_numeric_ea_dtype): + # GH#49830 + midx = MultiIndex.from_arrays( + [pd.Series([1, 2, 3], dtype=any_numeric_ea_dtype), [10, 11, 12]] + ) + midx2 = MultiIndex.from_arrays( + [pd.Series([5, 6, 7], dtype=any_numeric_ea_dtype), [-1, -2, -3]] + ) + result = midx.putmask([True, False, False], midx2) + expected = MultiIndex.from_arrays( + [pd.Series([5, 2, 3], dtype=any_numeric_ea_dtype), [-1, 11, 12]] + ) + tm.assert_index_equal(result, expected) + + def test_putmask_keep_dtype_shorter_value(self, any_numeric_ea_dtype): + # GH#49830 + midx = MultiIndex.from_arrays( + [pd.Series([1, 2, 3], dtype=any_numeric_ea_dtype), [10, 11, 12]] + ) + midx2 = MultiIndex.from_arrays( + [pd.Series([5], dtype=any_numeric_ea_dtype), [-1]] + ) + result = midx.putmask([True, False, False], midx2) + expected = MultiIndex.from_arrays( + [pd.Series([5, 2, 3], dtype=any_numeric_ea_dtype), [-1, 11, 12]] + ) + tm.assert_index_equal(result, expected) + + +class TestGetIndexer: + def test_get_indexer(self): + major_axis = Index(np.arange(4)) + minor_axis = Index(np.arange(2)) + + major_codes = np.array([0, 0, 1, 2, 2, 3, 3], dtype=np.intp) + minor_codes = np.array([0, 1, 0, 0, 1, 0, 1], dtype=np.intp) + + index = MultiIndex( + levels=[major_axis, minor_axis], codes=[major_codes, minor_codes] + ) + idx1 = index[:5] + idx2 = index[[1, 3, 5]] + + r1 = idx1.get_indexer(idx2) + tm.assert_almost_equal(r1, np.array([1, 3, -1], dtype=np.intp)) + + r1 = idx2.get_indexer(idx1, method="pad") + e1 = np.array([-1, 0, 0, 1, 1], dtype=np.intp) + tm.assert_almost_equal(r1, e1) + + r2 = idx2.get_indexer(idx1[::-1], method="pad") + tm.assert_almost_equal(r2, e1[::-1]) + + rffill1 = idx2.get_indexer(idx1, method="ffill") + tm.assert_almost_equal(r1, rffill1) + + r1 = idx2.get_indexer(idx1, method="backfill") + e1 = np.array([0, 0, 1, 1, 2], dtype=np.intp) + tm.assert_almost_equal(r1, e1) + + r2 = idx2.get_indexer(idx1[::-1], method="backfill") + tm.assert_almost_equal(r2, e1[::-1]) + + rbfill1 = idx2.get_indexer(idx1, method="bfill") + tm.assert_almost_equal(r1, rbfill1) + + # pass non-MultiIndex + r1 = idx1.get_indexer(idx2.values) + rexp1 = idx1.get_indexer(idx2) + tm.assert_almost_equal(r1, rexp1) + + r1 = idx1.get_indexer([1, 2, 3]) + assert (r1 == [-1, -1, -1]).all() + + # create index with duplicates + idx1 = Index(list(range(10)) + list(range(10))) + idx2 = Index(list(range(20))) + + msg = "Reindexing only valid with uniquely valued Index objects" + with pytest.raises(InvalidIndexError, match=msg): + idx1.get_indexer(idx2) + + def test_get_indexer_nearest(self): + midx = MultiIndex.from_tuples([("a", 1), ("b", 2)]) + msg = ( + "method='nearest' not implemented yet for MultiIndex; " + "see GitHub issue 9365" + ) + with pytest.raises(NotImplementedError, match=msg): + midx.get_indexer(["a"], method="nearest") + msg = "tolerance not implemented yet for MultiIndex" + with pytest.raises(NotImplementedError, match=msg): + midx.get_indexer(["a"], method="pad", tolerance=2) + + def test_get_indexer_categorical_time(self): + # https://github.com/pandas-dev/pandas/issues/21390 + midx = MultiIndex.from_product( + [ + Categorical(["a", "b", "c"]), + Categorical(date_range("2012-01-01", periods=3, freq="h")), + ] + ) + result = midx.get_indexer(midx) + tm.assert_numpy_array_equal(result, np.arange(9, dtype=np.intp)) + + @pytest.mark.parametrize( + "index_arr,labels,expected", + [ + ( + [[1, np.nan, 2], [3, 4, 5]], + [1, np.nan, 2], + np.array([-1, -1, -1], dtype=np.intp), + ), + ([[1, np.nan, 2], [3, 4, 5]], [(np.nan, 4)], np.array([1], dtype=np.intp)), + ([[1, 2, 3], [np.nan, 4, 5]], [(1, np.nan)], np.array([0], dtype=np.intp)), + ( + [[1, 2, 3], [np.nan, 4, 5]], + [np.nan, 4, 5], + np.array([-1, -1, -1], dtype=np.intp), + ), + ], + ) + def test_get_indexer_with_missing_value(self, index_arr, labels, expected): + # issue 19132 + idx = MultiIndex.from_arrays(index_arr) + result = idx.get_indexer(labels) + tm.assert_numpy_array_equal(result, expected) + + def test_get_indexer_methods(self): + # https://github.com/pandas-dev/pandas/issues/29896 + # test getting an indexer for another index with different methods + # confirms that getting an indexer without a filling method, getting an + # indexer and backfilling, and getting an indexer and padding all behave + # correctly in the case where all of the target values fall in between + # several levels in the MultiIndex into which they are getting an indexer + # + # visually, the MultiIndexes used in this test are: + # mult_idx_1: + # 0: -1 0 + # 1: 2 + # 2: 3 + # 3: 4 + # 4: 0 0 + # 5: 2 + # 6: 3 + # 7: 4 + # 8: 1 0 + # 9: 2 + # 10: 3 + # 11: 4 + # + # mult_idx_2: + # 0: 0 1 + # 1: 3 + # 2: 4 + mult_idx_1 = MultiIndex.from_product([[-1, 0, 1], [0, 2, 3, 4]]) + mult_idx_2 = MultiIndex.from_product([[0], [1, 3, 4]]) + + indexer = mult_idx_1.get_indexer(mult_idx_2) + expected = np.array([-1, 6, 7], dtype=indexer.dtype) + tm.assert_almost_equal(expected, indexer) + + backfill_indexer = mult_idx_1.get_indexer(mult_idx_2, method="backfill") + expected = np.array([5, 6, 7], dtype=backfill_indexer.dtype) + tm.assert_almost_equal(expected, backfill_indexer) + + # ensure the legacy "bfill" option functions identically to "backfill" + backfill_indexer = mult_idx_1.get_indexer(mult_idx_2, method="bfill") + expected = np.array([5, 6, 7], dtype=backfill_indexer.dtype) + tm.assert_almost_equal(expected, backfill_indexer) + + pad_indexer = mult_idx_1.get_indexer(mult_idx_2, method="pad") + expected = np.array([4, 6, 7], dtype=pad_indexer.dtype) + tm.assert_almost_equal(expected, pad_indexer) + + # ensure the legacy "ffill" option functions identically to "pad" + pad_indexer = mult_idx_1.get_indexer(mult_idx_2, method="ffill") + expected = np.array([4, 6, 7], dtype=pad_indexer.dtype) + tm.assert_almost_equal(expected, pad_indexer) + + @pytest.mark.parametrize("method", ["pad", "ffill", "backfill", "bfill", "nearest"]) + def test_get_indexer_methods_raise_for_non_monotonic(self, method): + # 53452 + mi = MultiIndex.from_arrays([[0, 4, 2], [0, 4, 2]]) + if method == "nearest": + err = NotImplementedError + msg = "not implemented yet for MultiIndex" + else: + err = ValueError + msg = "index must be monotonic increasing or decreasing" + with pytest.raises(err, match=msg): + mi.get_indexer([(1, 1)], method=method) + + def test_get_indexer_three_or_more_levels(self): + # https://github.com/pandas-dev/pandas/issues/29896 + # tests get_indexer() on MultiIndexes with 3+ levels + # visually, these are + # mult_idx_1: + # 0: 1 2 5 + # 1: 7 + # 2: 4 5 + # 3: 7 + # 4: 6 5 + # 5: 7 + # 6: 3 2 5 + # 7: 7 + # 8: 4 5 + # 9: 7 + # 10: 6 5 + # 11: 7 + # + # mult_idx_2: + # 0: 1 1 8 + # 1: 1 5 9 + # 2: 1 6 7 + # 3: 2 1 6 + # 4: 2 7 6 + # 5: 2 7 8 + # 6: 3 6 8 + mult_idx_1 = MultiIndex.from_product([[1, 3], [2, 4, 6], [5, 7]]) + mult_idx_2 = MultiIndex.from_tuples( + [ + (1, 1, 8), + (1, 5, 9), + (1, 6, 7), + (2, 1, 6), + (2, 7, 7), + (2, 7, 8), + (3, 6, 8), + ] + ) + # sanity check + assert mult_idx_1.is_monotonic_increasing + assert mult_idx_1.is_unique + assert mult_idx_2.is_monotonic_increasing + assert mult_idx_2.is_unique + + # show the relationships between the two + assert mult_idx_2[0] < mult_idx_1[0] + assert mult_idx_1[3] < mult_idx_2[1] < mult_idx_1[4] + assert mult_idx_1[5] == mult_idx_2[2] + assert mult_idx_1[5] < mult_idx_2[3] < mult_idx_1[6] + assert mult_idx_1[5] < mult_idx_2[4] < mult_idx_1[6] + assert mult_idx_1[5] < mult_idx_2[5] < mult_idx_1[6] + assert mult_idx_1[-1] < mult_idx_2[6] + + indexer_no_fill = mult_idx_1.get_indexer(mult_idx_2) + expected = np.array([-1, -1, 5, -1, -1, -1, -1], dtype=indexer_no_fill.dtype) + tm.assert_almost_equal(expected, indexer_no_fill) + + # test with backfilling + indexer_backfilled = mult_idx_1.get_indexer(mult_idx_2, method="backfill") + expected = np.array([0, 4, 5, 6, 6, 6, -1], dtype=indexer_backfilled.dtype) + tm.assert_almost_equal(expected, indexer_backfilled) + + # now, the same thing, but forward-filled (aka "padded") + indexer_padded = mult_idx_1.get_indexer(mult_idx_2, method="pad") + expected = np.array([-1, 3, 5, 5, 5, 5, 11], dtype=indexer_padded.dtype) + tm.assert_almost_equal(expected, indexer_padded) + + # now, do the indexing in the other direction + assert mult_idx_2[0] < mult_idx_1[0] < mult_idx_2[1] + assert mult_idx_2[0] < mult_idx_1[1] < mult_idx_2[1] + assert mult_idx_2[0] < mult_idx_1[2] < mult_idx_2[1] + assert mult_idx_2[0] < mult_idx_1[3] < mult_idx_2[1] + assert mult_idx_2[1] < mult_idx_1[4] < mult_idx_2[2] + assert mult_idx_2[2] == mult_idx_1[5] + assert mult_idx_2[5] < mult_idx_1[6] < mult_idx_2[6] + assert mult_idx_2[5] < mult_idx_1[7] < mult_idx_2[6] + assert mult_idx_2[5] < mult_idx_1[8] < mult_idx_2[6] + assert mult_idx_2[5] < mult_idx_1[9] < mult_idx_2[6] + assert mult_idx_2[5] < mult_idx_1[10] < mult_idx_2[6] + assert mult_idx_2[5] < mult_idx_1[11] < mult_idx_2[6] + + indexer = mult_idx_2.get_indexer(mult_idx_1) + expected = np.array( + [-1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1], dtype=indexer.dtype + ) + tm.assert_almost_equal(expected, indexer) + + backfill_indexer = mult_idx_2.get_indexer(mult_idx_1, method="bfill") + expected = np.array( + [1, 1, 1, 1, 2, 2, 6, 6, 6, 6, 6, 6], dtype=backfill_indexer.dtype + ) + tm.assert_almost_equal(expected, backfill_indexer) + + pad_indexer = mult_idx_2.get_indexer(mult_idx_1, method="pad") + expected = np.array( + [0, 0, 0, 0, 1, 2, 5, 5, 5, 5, 5, 5], dtype=pad_indexer.dtype + ) + tm.assert_almost_equal(expected, pad_indexer) + + def test_get_indexer_crossing_levels(self): + # https://github.com/pandas-dev/pandas/issues/29896 + # tests a corner case with get_indexer() with MultiIndexes where, when we + # need to "carry" across levels, proper tuple ordering is respected + # + # the MultiIndexes used in this test, visually, are: + # mult_idx_1: + # 0: 1 1 1 1 + # 1: 2 + # 2: 2 1 + # 3: 2 + # 4: 1 2 1 1 + # 5: 2 + # 6: 2 1 + # 7: 2 + # 8: 2 1 1 1 + # 9: 2 + # 10: 2 1 + # 11: 2 + # 12: 2 2 1 1 + # 13: 2 + # 14: 2 1 + # 15: 2 + # + # mult_idx_2: + # 0: 1 3 2 2 + # 1: 2 3 2 2 + mult_idx_1 = MultiIndex.from_product([[1, 2]] * 4) + mult_idx_2 = MultiIndex.from_tuples([(1, 3, 2, 2), (2, 3, 2, 2)]) + + # show the tuple orderings, which get_indexer() should respect + assert mult_idx_1[7] < mult_idx_2[0] < mult_idx_1[8] + assert mult_idx_1[-1] < mult_idx_2[1] + + indexer = mult_idx_1.get_indexer(mult_idx_2) + expected = np.array([-1, -1], dtype=indexer.dtype) + tm.assert_almost_equal(expected, indexer) + + backfill_indexer = mult_idx_1.get_indexer(mult_idx_2, method="bfill") + expected = np.array([8, -1], dtype=backfill_indexer.dtype) + tm.assert_almost_equal(expected, backfill_indexer) + + pad_indexer = mult_idx_1.get_indexer(mult_idx_2, method="ffill") + expected = np.array([7, 15], dtype=pad_indexer.dtype) + tm.assert_almost_equal(expected, pad_indexer) + + def test_get_indexer_kwarg_validation(self): + # GH#41918 + mi = MultiIndex.from_product([range(3), ["A", "B"]]) + + msg = "limit argument only valid if doing pad, backfill or nearest" + with pytest.raises(ValueError, match=msg): + mi.get_indexer(mi[:-1], limit=4) + + msg = "tolerance argument only valid if doing pad, backfill or nearest" + with pytest.raises(ValueError, match=msg): + mi.get_indexer(mi[:-1], tolerance="piano") + + def test_get_indexer_nan(self): + # GH#37222 + idx1 = MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]) + idx2 = MultiIndex.from_product([["A"], [np.nan, 2.0]], names=["id1", "id2"]) + expected = np.array([-1, 1]) + result = idx2.get_indexer(idx1) + tm.assert_numpy_array_equal(result, expected, check_dtype=False) + result = idx1.get_indexer(idx2) + tm.assert_numpy_array_equal(result, expected, check_dtype=False) + + +def test_getitem(idx): + # scalar + assert idx[2] == ("bar", "one") + + # slice + result = idx[2:5] + expected = idx[[2, 3, 4]] + assert result.equals(expected) + + # boolean + result = idx[[True, False, True, False, True, True]] + result2 = idx[np.array([True, False, True, False, True, True])] + expected = idx[[0, 2, 4, 5]] + assert result.equals(expected) + assert result2.equals(expected) + + +def test_getitem_group_select(idx): + sorted_idx, _ = idx.sortlevel(0) + assert sorted_idx.get_loc("baz") == slice(3, 4) + assert sorted_idx.get_loc("foo") == slice(0, 2) + + +@pytest.mark.parametrize("ind1", [[True] * 5, Index([True] * 5)]) +@pytest.mark.parametrize( + "ind2", + [[True, False, True, False, False], Index([True, False, True, False, False])], +) +def test_getitem_bool_index_all(ind1, ind2): + # GH#22533 + idx = MultiIndex.from_tuples([(10, 1), (20, 2), (30, 3), (40, 4), (50, 5)]) + tm.assert_index_equal(idx[ind1], idx) + + expected = MultiIndex.from_tuples([(10, 1), (30, 3)]) + tm.assert_index_equal(idx[ind2], expected) + + +@pytest.mark.parametrize("ind1", [[True], Index([True])]) +@pytest.mark.parametrize("ind2", [[False], Index([False])]) +def test_getitem_bool_index_single(ind1, ind2): + # GH#22533 + idx = MultiIndex.from_tuples([(10, 1)]) + tm.assert_index_equal(idx[ind1], idx) + + expected = MultiIndex( + levels=[np.array([], dtype=np.int64), np.array([], dtype=np.int64)], + codes=[[], []], + ) + tm.assert_index_equal(idx[ind2], expected) + + +class TestGetLoc: + def test_get_loc(self, idx): + assert idx.get_loc(("foo", "two")) == 1 + assert idx.get_loc(("baz", "two")) == 3 + with pytest.raises(KeyError, match=r"^\('bar', 'two'\)$"): + idx.get_loc(("bar", "two")) + with pytest.raises(KeyError, match=r"^'quux'$"): + idx.get_loc("quux") + + # 3 levels + index = MultiIndex( + levels=[Index(np.arange(4)), Index(np.arange(4)), Index(np.arange(4))], + codes=[ + np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0]), + ], + ) + with pytest.raises(KeyError, match=r"^\(1, 1\)$"): + index.get_loc((1, 1)) + assert index.get_loc((2, 0)) == slice(3, 5) + + def test_get_loc_duplicates(self): + index = Index([2, 2, 2, 2]) + result = index.get_loc(2) + expected = slice(0, 4) + assert result == expected + + index = Index(["c", "a", "a", "b", "b"]) + rs = index.get_loc("c") + xp = 0 + assert rs == xp + + with pytest.raises(KeyError, match="2"): + index.get_loc(2) + + def test_get_loc_level(self): + index = MultiIndex( + levels=[Index(np.arange(4)), Index(np.arange(4)), Index(np.arange(4))], + codes=[ + np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0]), + ], + ) + loc, new_index = index.get_loc_level((0, 1)) + expected = slice(1, 2) + exp_index = index[expected].droplevel(0).droplevel(0) + assert loc == expected + assert new_index.equals(exp_index) + + loc, new_index = index.get_loc_level((0, 1, 0)) + expected = 1 + assert loc == expected + assert new_index is None + + with pytest.raises(KeyError, match=r"^\(2, 2\)$"): + index.get_loc_level((2, 2)) + # GH 22221: unused label + with pytest.raises(KeyError, match=r"^2$"): + index.drop(2).get_loc_level(2) + # Unused label on unsorted level: + with pytest.raises(KeyError, match=r"^2$"): + index.drop(1, level=2).get_loc_level(2, level=2) + + index = MultiIndex( + levels=[[2000], list(range(4))], + codes=[np.array([0, 0, 0, 0]), np.array([0, 1, 2, 3])], + ) + result, new_index = index.get_loc_level((2000, slice(None, None))) + expected = slice(None, None) + assert result == expected + assert new_index.equals(index.droplevel(0)) + + @pytest.mark.parametrize("dtype1", [int, float, bool, str]) + @pytest.mark.parametrize("dtype2", [int, float, bool, str]) + def test_get_loc_multiple_dtypes(self, dtype1, dtype2): + # GH 18520 + levels = [np.array([0, 1]).astype(dtype1), np.array([0, 1]).astype(dtype2)] + idx = MultiIndex.from_product(levels) + assert idx.get_loc(idx[2]) == 2 + + @pytest.mark.parametrize("level", [0, 1]) + @pytest.mark.parametrize("dtypes", [[int, float], [float, int]]) + def test_get_loc_implicit_cast(self, level, dtypes): + # GH 18818, GH 15994 : as flat index, cast int to float and vice-versa + levels = [["a", "b"], ["c", "d"]] + key = ["b", "d"] + lev_dtype, key_dtype = dtypes + levels[level] = np.array([0, 1], dtype=lev_dtype) + key[level] = key_dtype(1) + idx = MultiIndex.from_product(levels) + assert idx.get_loc(tuple(key)) == 3 + + @pytest.mark.parametrize("dtype", [bool, object]) + def test_get_loc_cast_bool(self, dtype): + # GH 19086 : int is casted to bool, but not vice-versa (for object dtype) + # With bool dtype, we don't cast in either direction. + levels = [Index([False, True], dtype=dtype), np.arange(2, dtype="int64")] + idx = MultiIndex.from_product(levels) + + if dtype is bool: + with pytest.raises(KeyError, match=r"^\(0, 1\)$"): + assert idx.get_loc((0, 1)) == 1 + with pytest.raises(KeyError, match=r"^\(1, 0\)$"): + assert idx.get_loc((1, 0)) == 2 + else: + # We use python object comparisons, which treat 0 == False and 1 == True + assert idx.get_loc((0, 1)) == 1 + assert idx.get_loc((1, 0)) == 2 + + with pytest.raises(KeyError, match=r"^\(False, True\)$"): + idx.get_loc((False, True)) + with pytest.raises(KeyError, match=r"^\(True, False\)$"): + idx.get_loc((True, False)) + + @pytest.mark.parametrize("level", [0, 1]) + def test_get_loc_nan(self, level, nulls_fixture): + # GH 18485 : NaN in MultiIndex + levels = [["a", "b"], ["c", "d"]] + key = ["b", "d"] + levels[level] = np.array([0, nulls_fixture], dtype=type(nulls_fixture)) + key[level] = nulls_fixture + idx = MultiIndex.from_product(levels) + assert idx.get_loc(tuple(key)) == 3 + + def test_get_loc_missing_nan(self): + # GH 8569 + idx = MultiIndex.from_arrays([[1.0, 2.0], [3.0, 4.0]]) + assert isinstance(idx.get_loc(1), slice) + with pytest.raises(KeyError, match=r"^3$"): + idx.get_loc(3) + with pytest.raises(KeyError, match=r"^nan$"): + idx.get_loc(np.nan) + with pytest.raises(InvalidIndexError, match=r"\[nan\]"): + # listlike/non-hashable raises TypeError + idx.get_loc([np.nan]) + + def test_get_loc_with_values_including_missing_values(self): + # issue 19132 + idx = MultiIndex.from_product([[np.nan, 1]] * 2) + expected = slice(0, 2, None) + assert idx.get_loc(np.nan) == expected + + idx = MultiIndex.from_arrays([[np.nan, 1, 2, np.nan]]) + expected = np.array([True, False, False, True]) + tm.assert_numpy_array_equal(idx.get_loc(np.nan), expected) + + idx = MultiIndex.from_product([[np.nan, 1]] * 3) + expected = slice(2, 4, None) + assert idx.get_loc((np.nan, 1)) == expected + + def test_get_loc_duplicates2(self): + # TODO: de-duplicate with test_get_loc_duplicates above? + index = MultiIndex( + levels=[["D", "B", "C"], [0, 26, 27, 37, 57, 67, 75, 82]], + codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], + names=["tag", "day"], + ) + + assert index.get_loc("D") == slice(0, 3) + + def test_get_loc_past_lexsort_depth(self): + # GH#30053 + idx = MultiIndex( + levels=[["a"], [0, 7], [1]], + codes=[[0, 0], [1, 0], [0, 0]], + names=["x", "y", "z"], + sortorder=0, + ) + key = ("a", 7) + + with tm.assert_produces_warning(PerformanceWarning): + # PerformanceWarning: indexing past lexsort depth may impact performance + result = idx.get_loc(key) + + assert result == slice(0, 1, None) + + def test_multiindex_get_loc_list_raises(self): + # GH#35878 + idx = MultiIndex.from_tuples([("a", 1), ("b", 2)]) + msg = r"\[\]" + with pytest.raises(InvalidIndexError, match=msg): + idx.get_loc([]) + + def test_get_loc_nested_tuple_raises_keyerror(self): + # raise KeyError, not TypeError + mi = MultiIndex.from_product([range(3), range(4), range(5), range(6)]) + key = ((2, 3, 4), "foo") + + with pytest.raises(KeyError, match=re.escape(str(key))): + mi.get_loc(key) + + +class TestWhere: + def test_where(self): + i = MultiIndex.from_tuples([("A", 1), ("A", 2)]) + + msg = r"\.where is not supported for MultiIndex operations" + with pytest.raises(NotImplementedError, match=msg): + i.where(True) + + def test_where_array_like(self, listlike_box): + mi = MultiIndex.from_tuples([("A", 1), ("A", 2)]) + cond = [False, True] + msg = r"\.where is not supported for MultiIndex operations" + with pytest.raises(NotImplementedError, match=msg): + mi.where(listlike_box(cond)) + + +class TestContains: + def test_contains_top_level(self): + midx = MultiIndex.from_product([["A", "B"], [1, 2]]) + assert "A" in midx + assert "A" not in midx._engine + + def test_contains_with_nat(self): + # MI with a NaT + mi = MultiIndex( + levels=[["C"], date_range("2012-01-01", periods=5)], + codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], + names=[None, "B"], + ) + assert ("C", pd.Timestamp("2012-01-01")) in mi + for val in mi.values: + assert val in mi + + def test_contains(self, idx): + assert ("foo", "two") in idx + assert ("bar", "two") not in idx + assert None not in idx + + def test_contains_with_missing_value(self): + # GH#19132 + idx = MultiIndex.from_arrays([[1, np.nan, 2]]) + assert np.nan in idx + + idx = MultiIndex.from_arrays([[1, 2], [np.nan, 3]]) + assert np.nan not in idx + assert (1, np.nan) in idx + + def test_multiindex_contains_dropped(self): + # GH#19027 + # test that dropped MultiIndex levels are not in the MultiIndex + # despite continuing to be in the MultiIndex's levels + idx = MultiIndex.from_product([[1, 2], [3, 4]]) + assert 2 in idx + idx = idx.drop(2) + + # drop implementation keeps 2 in the levels + assert 2 in idx.levels[0] + # but it should no longer be in the index itself + assert 2 not in idx + + # also applies to strings + idx = MultiIndex.from_product([["a", "b"], ["c", "d"]]) + assert "a" in idx + idx = idx.drop("a") + assert "a" in idx.levels[0] + assert "a" not in idx + + def test_contains_td64_level(self): + # GH#24570 + tx = pd.timedelta_range("09:30:00", "16:00:00", freq="30 min") + idx = MultiIndex.from_arrays([tx, np.arange(len(tx))]) + assert tx[0] in idx + assert "element_not_exit" not in idx + assert "0 day 09:30:00" in idx + + def test_large_mi_contains(self, monkeypatch): + # GH#10645 + with monkeypatch.context(): + monkeypatch.setattr(libindex, "_SIZE_CUTOFF", 10) + result = MultiIndex.from_arrays([range(10), range(10)]) + assert (10, 0) not in result + + +def test_timestamp_multiindex_indexer(): + # https://github.com/pandas-dev/pandas/issues/26944 + idx = MultiIndex.from_product( + [ + date_range("2019-01-01T00:15:33", periods=100, freq="h", name="date"), + ["x"], + [3], + ] + ) + df = DataFrame({"foo": np.arange(len(idx))}, idx) + result = df.loc[pd.IndexSlice["2019-1-2":, "x", :], "foo"] + qidx = MultiIndex.from_product( + [ + date_range( + start="2019-01-02T00:15:33", + end="2019-01-05T03:15:33", + freq="h", + name="date", + ), + ["x"], + [3], + ] + ) + should_be = pd.Series(data=np.arange(24, len(qidx) + 24), index=qidx, name="foo") + tm.assert_series_equal(result, should_be) + + +@pytest.mark.parametrize( + "index_arr,expected,target,algo", + [ + ([[np.nan, "a", "b"], ["c", "d", "e"]], 0, np.nan, "left"), + ([[np.nan, "a", "b"], ["c", "d", "e"]], 1, (np.nan, "c"), "right"), + ([["a", "b", "c"], ["d", np.nan, "d"]], 1, ("b", np.nan), "left"), + ], +) +def test_get_slice_bound_with_missing_value(index_arr, expected, target, algo): + # issue 19132 + idx = MultiIndex.from_arrays(index_arr) + result = idx.get_slice_bound(target, side=algo) + assert result == expected + + +@pytest.mark.parametrize( + "index_arr,expected,start_idx,end_idx", + [ + ([[np.nan, 1, 2], [3, 4, 5]], slice(0, 2, None), np.nan, 1), + ([[np.nan, 1, 2], [3, 4, 5]], slice(0, 3, None), np.nan, (2, 5)), + ([[1, 2, 3], [4, np.nan, 5]], slice(1, 3, None), (2, np.nan), 3), + ([[1, 2, 3], [4, np.nan, 5]], slice(1, 3, None), (2, np.nan), (3, 5)), + ], +) +def test_slice_indexer_with_missing_value(index_arr, expected, start_idx, end_idx): + # issue 19132 + idx = MultiIndex.from_arrays(index_arr) + result = idx.slice_indexer(start=start_idx, end=end_idx) + assert result == expected + + +def test_pyint_engine(): + # GH#18519 : when combinations of codes cannot be represented in 64 + # bits, the index underlying the MultiIndex engine works with Python + # integers, rather than uint64. + N = 5 + keys = [ + tuple(arr) + for arr in [ + [0] * 10 * N, + [1] * 10 * N, + [2] * 10 * N, + [np.nan] * N + [2] * 9 * N, + [0] * N + [2] * 9 * N, + [np.nan] * N + [2] * 8 * N + [0] * N, + ] + ] + # Each level contains 4 elements (including NaN), so it is represented + # in 2 bits, for a total of 2*N*10 = 100 > 64 bits. If we were using a + # 64 bit engine and truncating the first levels, the fourth and fifth + # keys would collide; if truncating the last levels, the fifth and + # sixth; if rotating bits rather than shifting, the third and fifth. + + for idx, key_value in enumerate(keys): + index = MultiIndex.from_tuples(keys) + assert index.get_loc(key_value) == idx + + expected = np.arange(idx + 1, dtype=np.intp) + result = index.get_indexer([keys[i] for i in expected]) + tm.assert_numpy_array_equal(result, expected) + + # With missing key: + idces = range(len(keys)) + expected = np.array([-1] + list(idces), dtype=np.intp) + missing = tuple([0, 1] * 5 * N) + result = index.get_indexer([missing] + [keys[i] for i in idces]) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize( + "keys,expected", + [ + ((slice(None), [5, 4]), [1, 0]), + ((slice(None), [4, 5]), [0, 1]), + (([True, False, True], [4, 6]), [0, 2]), + (([True, False, True], [6, 4]), [0, 2]), + ((2, [4, 5]), [0, 1]), + ((2, [5, 4]), [1, 0]), + (([2], [4, 5]), [0, 1]), + (([2], [5, 4]), [1, 0]), + ], +) +def test_get_locs_reordering(keys, expected): + # GH48384 + idx = MultiIndex.from_arrays( + [ + [2, 2, 1], + [4, 5, 6], + ] + ) + result = idx.get_locs(keys) + expected = np.array(expected, dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + +def test_get_indexer_for_multiindex_with_nans(nulls_fixture): + # GH37222 + idx1 = MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]) + idx2 = MultiIndex.from_product([["A"], [nulls_fixture, 2.0]], names=["id1", "id2"]) + + result = idx2.get_indexer(idx1) + expected = np.array([-1, 1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + result = idx1.get_indexer(idx2) + expected = np.array([-1, 1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_integrity.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_integrity.py new file mode 100644 index 0000000000000000000000000000000000000000..d956747cbc859f40b69e52ea78c85ebce31f3427 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_integrity.py @@ -0,0 +1,289 @@ +import re + +import numpy as np +import pytest + +from pandas._libs import index as libindex + +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike + +import pandas as pd +from pandas import ( + Index, + IntervalIndex, + MultiIndex, + RangeIndex, +) +import pandas._testing as tm + + +def test_labels_dtypes(): + # GH 8456 + i = MultiIndex.from_tuples([("A", 1), ("A", 2)]) + assert i.codes[0].dtype == "int8" + assert i.codes[1].dtype == "int8" + + i = MultiIndex.from_product([["a"], range(40)]) + assert i.codes[1].dtype == "int8" + i = MultiIndex.from_product([["a"], range(400)]) + assert i.codes[1].dtype == "int16" + i = MultiIndex.from_product([["a"], range(40000)]) + assert i.codes[1].dtype == "int32" + + i = MultiIndex.from_product([["a"], range(1000)]) + assert (i.codes[0] >= 0).all() + assert (i.codes[1] >= 0).all() + + +def test_values_boxed(): + tuples = [ + (1, pd.Timestamp("2000-01-01")), + (2, pd.NaT), + (3, pd.Timestamp("2000-01-03")), + (1, pd.Timestamp("2000-01-04")), + (2, pd.Timestamp("2000-01-02")), + (3, pd.Timestamp("2000-01-03")), + ] + result = MultiIndex.from_tuples(tuples) + expected = construct_1d_object_array_from_listlike(tuples) + tm.assert_numpy_array_equal(result.values, expected) + # Check that code branches for boxed values produce identical results + tm.assert_numpy_array_equal(result.values[:4], result[:4].values) + + +def test_values_multiindex_datetimeindex(): + # Test to ensure we hit the boxing / nobox part of MI.values + ints = np.arange(10**18, 10**18 + 5) + naive = pd.DatetimeIndex(ints) + + aware = pd.DatetimeIndex(ints, tz="US/Central") + + idx = MultiIndex.from_arrays([naive, aware]) + result = idx.values + + outer = pd.DatetimeIndex([x[0] for x in result]) + tm.assert_index_equal(outer, naive) + + inner = pd.DatetimeIndex([x[1] for x in result]) + tm.assert_index_equal(inner, aware) + + # n_lev > n_lab + result = idx[:2].values + + outer = pd.DatetimeIndex([x[0] for x in result]) + tm.assert_index_equal(outer, naive[:2]) + + inner = pd.DatetimeIndex([x[1] for x in result]) + tm.assert_index_equal(inner, aware[:2]) + + +def test_values_multiindex_periodindex(): + # Test to ensure we hit the boxing / nobox part of MI.values + ints = np.arange(2007, 2012) + pidx = pd.PeriodIndex(ints, freq="D") + + idx = MultiIndex.from_arrays([ints, pidx]) + result = idx.values + + outer = Index([x[0] for x in result]) + tm.assert_index_equal(outer, Index(ints, dtype=np.int64)) + + inner = pd.PeriodIndex([x[1] for x in result]) + tm.assert_index_equal(inner, pidx) + + # n_lev > n_lab + result = idx[:2].values + + outer = Index([x[0] for x in result]) + tm.assert_index_equal(outer, Index(ints[:2], dtype=np.int64)) + + inner = pd.PeriodIndex([x[1] for x in result]) + tm.assert_index_equal(inner, pidx[:2]) + + +def test_consistency(): + # need to construct an overflow + major_axis = list(range(70000)) + minor_axis = list(range(10)) + + major_codes = np.arange(70000) + minor_codes = np.repeat(range(10), 7000) + + # the fact that is works means it's consistent + index = MultiIndex( + levels=[major_axis, minor_axis], codes=[major_codes, minor_codes] + ) + + # inconsistent + major_codes = np.array([0, 0, 1, 1, 1, 2, 2, 3, 3]) + minor_codes = np.array([0, 1, 0, 1, 1, 0, 1, 0, 1]) + index = MultiIndex( + levels=[major_axis, minor_axis], codes=[major_codes, minor_codes] + ) + + assert index.is_unique is False + + +@pytest.mark.slow +def test_hash_collisions(monkeypatch): + # non-smoke test that we don't get hash collisions + size_cutoff = 50 + with monkeypatch.context() as m: + m.setattr(libindex, "_SIZE_CUTOFF", size_cutoff) + index = MultiIndex.from_product( + [np.arange(8), np.arange(8)], names=["one", "two"] + ) + result = index.get_indexer(index.values) + tm.assert_numpy_array_equal(result, np.arange(len(index), dtype="intp")) + + for i in [0, 1, len(index) - 2, len(index) - 1]: + result = index.get_loc(index[i]) + assert result == i + + +def test_dims(): + pass + + +def test_take_invalid_kwargs(): + vals = [["A", "B"], [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]] + idx = MultiIndex.from_product(vals, names=["str", "dt"]) + indices = [1, 2] + + msg = r"take\(\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=msg): + idx.take(indices, foo=2) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + idx.take(indices, out=indices) + + msg = "the 'mode' parameter is not supported" + with pytest.raises(ValueError, match=msg): + idx.take(indices, mode="clip") + + +def test_isna_behavior(idx): + # should not segfault GH5123 + # NOTE: if MI representation changes, may make sense to allow + # isna(MI) + msg = "isna is not defined for MultiIndex" + with pytest.raises(NotImplementedError, match=msg): + pd.isna(idx) + + +def test_large_multiindex_error(monkeypatch): + # GH12527 + size_cutoff = 50 + with monkeypatch.context() as m: + m.setattr(libindex, "_SIZE_CUTOFF", size_cutoff) + df_below_cutoff = pd.DataFrame( + 1, + index=MultiIndex.from_product([[1, 2], range(size_cutoff - 1)]), + columns=["dest"], + ) + with pytest.raises(KeyError, match=r"^\(-1, 0\)$"): + df_below_cutoff.loc[(-1, 0), "dest"] + with pytest.raises(KeyError, match=r"^\(3, 0\)$"): + df_below_cutoff.loc[(3, 0), "dest"] + df_above_cutoff = pd.DataFrame( + 1, + index=MultiIndex.from_product([[1, 2], range(size_cutoff + 1)]), + columns=["dest"], + ) + with pytest.raises(KeyError, match=r"^\(-1, 0\)$"): + df_above_cutoff.loc[(-1, 0), "dest"] + with pytest.raises(KeyError, match=r"^\(3, 0\)$"): + df_above_cutoff.loc[(3, 0), "dest"] + + +def test_mi_hashtable_populated_attribute_error(monkeypatch): + # GH 18165 + monkeypatch.setattr(libindex, "_SIZE_CUTOFF", 50) + r = range(50) + df = pd.DataFrame({"a": r, "b": r}, index=MultiIndex.from_arrays([r, r])) + + msg = "'Series' object has no attribute 'foo'" + with pytest.raises(AttributeError, match=msg): + df["a"].foo() + + +def test_can_hold_identifiers(idx): + key = idx[0] + assert idx._can_hold_identifiers_and_holds_name(key) is True + + +def test_metadata_immutable(idx): + levels, codes = idx.levels, idx.codes + # shouldn't be able to set at either the top level or base level + mutable_regex = re.compile("does not support mutable operations") + with pytest.raises(TypeError, match=mutable_regex): + levels[0] = levels[0] + with pytest.raises(TypeError, match=mutable_regex): + levels[0][0] = levels[0][0] + # ditto for labels + with pytest.raises(TypeError, match=mutable_regex): + codes[0] = codes[0] + with pytest.raises(ValueError, match="assignment destination is read-only"): + codes[0][0] = codes[0][0] + # and for names + names = idx.names + with pytest.raises(TypeError, match=mutable_regex): + names[0] = names[0] + + +def test_level_setting_resets_attributes(): + ind = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]]) + assert ind.is_monotonic_increasing + ind = ind.set_levels([["A", "B"], [1, 3, 2]]) + # if this fails, probably didn't reset the cache correctly. + assert not ind.is_monotonic_increasing + + +def test_rangeindex_fallback_coercion_bug(): + # GH 12893 + df1 = pd.DataFrame(np.arange(100).reshape((10, 10))) + df2 = pd.DataFrame(np.arange(100).reshape((10, 10))) + df = pd.concat( + {"df1": df1.stack(future_stack=True), "df2": df2.stack(future_stack=True)}, + axis=1, + ) + df.index.names = ["fizz", "buzz"] + + expected = pd.DataFrame( + {"df2": np.arange(100), "df1": np.arange(100)}, + index=MultiIndex.from_product([range(10), range(10)], names=["fizz", "buzz"]), + ) + tm.assert_frame_equal(df, expected, check_like=True) + + result = df.index.get_level_values("fizz") + expected = Index(np.arange(10, dtype=np.int64), name="fizz").repeat(10) + tm.assert_index_equal(result, expected) + + result = df.index.get_level_values("buzz") + expected = Index(np.tile(np.arange(10, dtype=np.int64), 10), name="buzz") + tm.assert_index_equal(result, expected) + + +def test_memory_usage(idx): + result = idx.memory_usage() + if len(idx): + idx.get_loc(idx[0]) + result2 = idx.memory_usage() + result3 = idx.memory_usage(deep=True) + + # RangeIndex, IntervalIndex + # don't have engines + if not isinstance(idx, (RangeIndex, IntervalIndex)): + assert result2 > result + + if idx.inferred_type == "object": + assert result3 > result2 + + else: + # we report 0 for no-length + assert result == 0 + + +def test_nlevels(idx): + assert idx.nlevels == 2 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_isin.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_isin.py new file mode 100644 index 0000000000000000000000000000000000000000..68fdf25359f1bbada24f6a2403d5a04331bee84c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_isin.py @@ -0,0 +1,103 @@ +import numpy as np +import pytest + +from pandas import MultiIndex +import pandas._testing as tm + + +def test_isin_nan(): + idx = MultiIndex.from_arrays([["foo", "bar"], [1.0, np.nan]]) + tm.assert_numpy_array_equal(idx.isin([("bar", np.nan)]), np.array([False, True])) + tm.assert_numpy_array_equal( + idx.isin([("bar", float("nan"))]), np.array([False, True]) + ) + + +def test_isin_missing(nulls_fixture): + # GH48905 + mi1 = MultiIndex.from_tuples([(1, nulls_fixture)]) + mi2 = MultiIndex.from_tuples([(1, 1), (1, 2)]) + result = mi2.isin(mi1) + expected = np.array([False, False]) + tm.assert_numpy_array_equal(result, expected) + + +def test_isin(): + values = [("foo", 2), ("bar", 3), ("quux", 4)] + + idx = MultiIndex.from_arrays([["qux", "baz", "foo", "bar"], np.arange(4)]) + result = idx.isin(values) + expected = np.array([False, False, True, True]) + tm.assert_numpy_array_equal(result, expected) + + # empty, return dtype bool + idx = MultiIndex.from_arrays([[], []]) + result = idx.isin(values) + assert len(result) == 0 + assert result.dtype == np.bool_ + + +def test_isin_level_kwarg(): + idx = MultiIndex.from_arrays([["qux", "baz", "foo", "bar"], np.arange(4)]) + + vals_0 = ["foo", "bar", "quux"] + vals_1 = [2, 3, 10] + + expected = np.array([False, False, True, True]) + tm.assert_numpy_array_equal(expected, idx.isin(vals_0, level=0)) + tm.assert_numpy_array_equal(expected, idx.isin(vals_0, level=-2)) + + tm.assert_numpy_array_equal(expected, idx.isin(vals_1, level=1)) + tm.assert_numpy_array_equal(expected, idx.isin(vals_1, level=-1)) + + msg = "Too many levels: Index has only 2 levels, not 6" + with pytest.raises(IndexError, match=msg): + idx.isin(vals_0, level=5) + msg = "Too many levels: Index has only 2 levels, -5 is not a valid level number" + with pytest.raises(IndexError, match=msg): + idx.isin(vals_0, level=-5) + + with pytest.raises(KeyError, match=r"'Level 1\.0 not found'"): + idx.isin(vals_0, level=1.0) + with pytest.raises(KeyError, match=r"'Level -1\.0 not found'"): + idx.isin(vals_1, level=-1.0) + with pytest.raises(KeyError, match="'Level A not found'"): + idx.isin(vals_1, level="A") + + idx.names = ["A", "B"] + tm.assert_numpy_array_equal(expected, idx.isin(vals_0, level="A")) + tm.assert_numpy_array_equal(expected, idx.isin(vals_1, level="B")) + + with pytest.raises(KeyError, match="'Level C not found'"): + idx.isin(vals_1, level="C") + + +@pytest.mark.parametrize( + "labels,expected,level", + [ + ([("b", np.nan)], np.array([False, False, True]), None), + ([np.nan, "a"], np.array([True, True, False]), 0), + (["d", np.nan], np.array([False, True, True]), 1), + ], +) +def test_isin_multi_index_with_missing_value(labels, expected, level): + # GH 19132 + midx = MultiIndex.from_arrays([[np.nan, "a", "b"], ["c", "d", np.nan]]) + result = midx.isin(labels, level=level) + tm.assert_numpy_array_equal(result, expected) + + +def test_isin_empty(): + # GH#51599 + midx = MultiIndex.from_arrays([[1, 2], [3, 4]]) + result = midx.isin([]) + expected = np.array([False, False]) + tm.assert_numpy_array_equal(result, expected) + + +def test_isin_generator(): + # GH#52568 + midx = MultiIndex.from_tuples([(1, 2)]) + result = midx.isin(x for x in [(1, 2)]) + expected = np.array([True]) + tm.assert_numpy_array_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_join.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_join.py new file mode 100644 index 0000000000000000000000000000000000000000..edd0feaaa1159ff8340af772d27f2a7af09ceb87 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_join.py @@ -0,0 +1,268 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Index, + Interval, + MultiIndex, + Series, + StringDtype, +) +import pandas._testing as tm + + +@pytest.mark.parametrize( + "other", [Index(["three", "one", "two"]), Index(["one"]), Index(["one", "three"])] +) +def test_join_level(idx, other, join_type): + join_index, lidx, ridx = other.join( + idx, how=join_type, level="second", return_indexers=True + ) + + exp_level = other.join(idx.levels[1], how=join_type) + assert join_index.levels[0].equals(idx.levels[0]) + assert join_index.levels[1].equals(exp_level) + + # pare down levels + mask = np.array([x[1] in exp_level for x in idx], dtype=bool) + exp_values = idx.values[mask] + tm.assert_numpy_array_equal(join_index.values, exp_values) + + if join_type in ("outer", "inner"): + join_index2, ridx2, lidx2 = idx.join( + other, how=join_type, level="second", return_indexers=True + ) + + assert join_index.equals(join_index2) + tm.assert_numpy_array_equal(lidx, lidx2) + tm.assert_numpy_array_equal(ridx, ridx2) + tm.assert_numpy_array_equal(join_index2.values, exp_values) + + +def test_join_level_corner_case(idx): + # some corner cases + index = Index(["three", "one", "two"]) + result = index.join(idx, level="second") + assert isinstance(result, MultiIndex) + + with pytest.raises(TypeError, match="Join.*MultiIndex.*ambiguous"): + idx.join(idx, level=1) + + +def test_join_self(idx, join_type): + result = idx.join(idx, how=join_type) + expected = idx + if join_type == "outer": + expected = expected.sort_values() + tm.assert_index_equal(result, expected) + + +def test_join_multi(): + # GH 10665 + midx = MultiIndex.from_product([np.arange(4), np.arange(4)], names=["a", "b"]) + idx = Index([1, 2, 5], name="b") + + # inner + jidx, lidx, ridx = midx.join(idx, how="inner", return_indexers=True) + exp_idx = MultiIndex.from_product([np.arange(4), [1, 2]], names=["a", "b"]) + exp_lidx = np.array([1, 2, 5, 6, 9, 10, 13, 14], dtype=np.intp) + exp_ridx = np.array([0, 1, 0, 1, 0, 1, 0, 1], dtype=np.intp) + tm.assert_index_equal(jidx, exp_idx) + tm.assert_numpy_array_equal(lidx, exp_lidx) + tm.assert_numpy_array_equal(ridx, exp_ridx) + # flip + jidx, ridx, lidx = idx.join(midx, how="inner", return_indexers=True) + tm.assert_index_equal(jidx, exp_idx) + tm.assert_numpy_array_equal(lidx, exp_lidx) + tm.assert_numpy_array_equal(ridx, exp_ridx) + + # keep MultiIndex + jidx, lidx, ridx = midx.join(idx, how="left", return_indexers=True) + exp_ridx = np.array( + [-1, 0, 1, -1, -1, 0, 1, -1, -1, 0, 1, -1, -1, 0, 1, -1], dtype=np.intp + ) + tm.assert_index_equal(jidx, midx) + assert lidx is None + tm.assert_numpy_array_equal(ridx, exp_ridx) + # flip + jidx, ridx, lidx = idx.join(midx, how="right", return_indexers=True) + tm.assert_index_equal(jidx, midx) + assert lidx is None + tm.assert_numpy_array_equal(ridx, exp_ridx) + + +def test_join_multi_wrong_order(): + # GH 25760 + # GH 28956 + + midx1 = MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"]) + midx2 = MultiIndex.from_product([[1, 2], [3, 4]], names=["b", "a"]) + + join_idx, lidx, ridx = midx1.join(midx2, return_indexers=True) + + exp_ridx = np.array([-1, -1, -1, -1], dtype=np.intp) + + tm.assert_index_equal(midx1, join_idx) + assert lidx is None + tm.assert_numpy_array_equal(ridx, exp_ridx) + + +def test_join_multi_return_indexers(): + # GH 34074 + + midx1 = MultiIndex.from_product([[1, 2], [3, 4], [5, 6]], names=["a", "b", "c"]) + midx2 = MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"]) + + result = midx1.join(midx2, return_indexers=False) + tm.assert_index_equal(result, midx1) + + +def test_join_overlapping_interval_level(): + # GH 44096 + idx_1 = MultiIndex.from_tuples( + [ + (1, Interval(0.0, 1.0)), + (1, Interval(1.0, 2.0)), + (1, Interval(2.0, 5.0)), + (2, Interval(0.0, 1.0)), + (2, Interval(1.0, 3.0)), # interval limit is here at 3.0, not at 2.0 + (2, Interval(3.0, 5.0)), + ], + names=["num", "interval"], + ) + + idx_2 = MultiIndex.from_tuples( + [ + (1, Interval(2.0, 5.0)), + (1, Interval(0.0, 1.0)), + (1, Interval(1.0, 2.0)), + (2, Interval(3.0, 5.0)), + (2, Interval(0.0, 1.0)), + (2, Interval(1.0, 3.0)), + ], + names=["num", "interval"], + ) + + expected = MultiIndex.from_tuples( + [ + (1, Interval(0.0, 1.0)), + (1, Interval(1.0, 2.0)), + (1, Interval(2.0, 5.0)), + (2, Interval(0.0, 1.0)), + (2, Interval(1.0, 3.0)), + (2, Interval(3.0, 5.0)), + ], + names=["num", "interval"], + ) + result = idx_1.join(idx_2, how="outer") + + tm.assert_index_equal(result, expected) + + +def test_join_midx_ea(): + # GH#49277 + midx = MultiIndex.from_arrays( + [Series([1, 1, 3], dtype="Int64"), Series([1, 2, 3], dtype="Int64")], + names=["a", "b"], + ) + midx2 = MultiIndex.from_arrays( + [Series([1], dtype="Int64"), Series([3], dtype="Int64")], names=["a", "c"] + ) + result = midx.join(midx2, how="inner") + expected = MultiIndex.from_arrays( + [ + Series([1, 1], dtype="Int64"), + Series([1, 2], dtype="Int64"), + Series([3, 3], dtype="Int64"), + ], + names=["a", "b", "c"], + ) + tm.assert_index_equal(result, expected) + + +def test_join_midx_string(): + # GH#49277 + midx = MultiIndex.from_arrays( + [ + Series(["a", "a", "c"], dtype=StringDtype()), + Series(["a", "b", "c"], dtype=StringDtype()), + ], + names=["a", "b"], + ) + midx2 = MultiIndex.from_arrays( + [Series(["a"], dtype=StringDtype()), Series(["c"], dtype=StringDtype())], + names=["a", "c"], + ) + result = midx.join(midx2, how="inner") + expected = MultiIndex.from_arrays( + [ + Series(["a", "a"], dtype=StringDtype()), + Series(["a", "b"], dtype=StringDtype()), + Series(["c", "c"], dtype=StringDtype()), + ], + names=["a", "b", "c"], + ) + tm.assert_index_equal(result, expected) + + +def test_join_multi_with_nan(): + # GH29252 + df1 = DataFrame( + data={"col1": [1.1, 1.2]}, + index=MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]), + ) + df2 = DataFrame( + data={"col2": [2.1, 2.2]}, + index=MultiIndex.from_product([["A"], [np.nan, 2.0]], names=["id1", "id2"]), + ) + result = df1.join(df2) + expected = DataFrame( + data={"col1": [1.1, 1.2], "col2": [np.nan, 2.2]}, + index=MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("val", [0, 5]) +def test_join_dtypes(any_numeric_ea_dtype, val): + # GH#49830 + midx = MultiIndex.from_arrays([Series([1, 2], dtype=any_numeric_ea_dtype), [3, 4]]) + midx2 = MultiIndex.from_arrays( + [Series([1, val, val], dtype=any_numeric_ea_dtype), [3, 4, 4]] + ) + result = midx.join(midx2, how="outer") + expected = MultiIndex.from_arrays( + [Series([val, val, 1, 2], dtype=any_numeric_ea_dtype), [4, 4, 3, 4]] + ).sort_values() + tm.assert_index_equal(result, expected) + + +def test_join_dtypes_all_nan(any_numeric_ea_dtype): + # GH#49830 + midx = MultiIndex.from_arrays( + [Series([1, 2], dtype=any_numeric_ea_dtype), [np.nan, np.nan]] + ) + midx2 = MultiIndex.from_arrays( + [Series([1, 0, 0], dtype=any_numeric_ea_dtype), [np.nan, np.nan, np.nan]] + ) + result = midx.join(midx2, how="outer") + expected = MultiIndex.from_arrays( + [ + Series([0, 0, 1, 2], dtype=any_numeric_ea_dtype), + [np.nan, np.nan, np.nan, np.nan], + ] + ) + tm.assert_index_equal(result, expected) + + +def test_join_index_levels(): + # GH#53093 + midx = midx = MultiIndex.from_tuples([("a", "2019-02-01"), ("a", "2019-02-01")]) + midx2 = MultiIndex.from_tuples([("a", "2019-01-31")]) + result = midx.join(midx2, how="outer") + expected = MultiIndex.from_tuples( + [("a", "2019-01-31"), ("a", "2019-02-01"), ("a", "2019-02-01")] + ) + tm.assert_index_equal(result.levels[1], expected.levels[1]) + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_lexsort.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_lexsort.py new file mode 100644 index 0000000000000000000000000000000000000000..fc16a4197a3a4daf65de6f58d85d13883d535d41 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_lexsort.py @@ -0,0 +1,46 @@ +from pandas import MultiIndex + + +class TestIsLexsorted: + def test_is_lexsorted(self): + levels = [[0, 1], [0, 1, 2]] + + index = MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] + ) + assert index._is_lexsorted() + + index = MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]] + ) + assert not index._is_lexsorted() + + index = MultiIndex( + levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]] + ) + assert not index._is_lexsorted() + assert index._lexsort_depth == 0 + + +class TestLexsortDepth: + def test_lexsort_depth(self): + # Test that lexsort_depth return the correct sortorder + # when it was given to the MultiIndex const. + # GH#28518 + + levels = [[0, 1], [0, 1, 2]] + + index = MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], sortorder=2 + ) + assert index._lexsort_depth == 2 + + index = MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], sortorder=1 + ) + assert index._lexsort_depth == 1 + + index = MultiIndex( + levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], sortorder=0 + ) + assert index._lexsort_depth == 0 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_missing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_missing.py new file mode 100644 index 0000000000000000000000000000000000000000..14ffc42fb4b59074c3c830a83ff6bdc36bdf099e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_missing.py @@ -0,0 +1,111 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import MultiIndex +import pandas._testing as tm + + +def test_fillna(idx): + # GH 11343 + msg = "isna is not defined for MultiIndex" + with pytest.raises(NotImplementedError, match=msg): + idx.fillna(idx[0]) + + +def test_dropna(): + # GH 6194 + idx = MultiIndex.from_arrays( + [ + [1, np.nan, 3, np.nan, 5], + [1, 2, np.nan, np.nan, 5], + ["a", "b", "c", np.nan, "e"], + ] + ) + + exp = MultiIndex.from_arrays([[1, 5], [1, 5], ["a", "e"]]) + tm.assert_index_equal(idx.dropna(), exp) + tm.assert_index_equal(idx.dropna(how="any"), exp) + + exp = MultiIndex.from_arrays( + [[1, np.nan, 3, 5], [1, 2, np.nan, 5], ["a", "b", "c", "e"]] + ) + tm.assert_index_equal(idx.dropna(how="all"), exp) + + msg = "invalid how option: xxx" + with pytest.raises(ValueError, match=msg): + idx.dropna(how="xxx") + + # GH26408 + # test if missing values are dropped for multiindex constructed + # from codes and values + idx = MultiIndex( + levels=[[np.nan, None, pd.NaT, "128", 2], [np.nan, None, pd.NaT, "128", 2]], + codes=[[0, -1, 1, 2, 3, 4], [0, -1, 3, 3, 3, 4]], + ) + expected = MultiIndex.from_arrays([["128", 2], ["128", 2]]) + tm.assert_index_equal(idx.dropna(), expected) + tm.assert_index_equal(idx.dropna(how="any"), expected) + + expected = MultiIndex.from_arrays( + [[np.nan, np.nan, "128", 2], ["128", "128", "128", 2]] + ) + tm.assert_index_equal(idx.dropna(how="all"), expected) + + +def test_nulls(idx): + # this is really a smoke test for the methods + # as these are adequately tested for function elsewhere + + msg = "isna is not defined for MultiIndex" + with pytest.raises(NotImplementedError, match=msg): + idx.isna() + + +@pytest.mark.xfail(reason="isna is not defined for MultiIndex") +def test_hasnans_isnans(idx): + # GH 11343, added tests for hasnans / isnans + index = idx.copy() + + # cases in indices doesn't include NaN + expected = np.array([False] * len(index), dtype=bool) + tm.assert_numpy_array_equal(index._isnan, expected) + assert index.hasnans is False + + index = idx.copy() + values = index.values + values[1] = np.nan + + index = type(idx)(values) + + expected = np.array([False] * len(index), dtype=bool) + expected[1] = True + tm.assert_numpy_array_equal(index._isnan, expected) + assert index.hasnans is True + + +def test_nan_stays_float(): + # GH 7031 + idx0 = MultiIndex(levels=[["A", "B"], []], codes=[[1, 0], [-1, -1]], names=[0, 1]) + idx1 = MultiIndex(levels=[["C"], ["D"]], codes=[[0], [0]], names=[0, 1]) + idxm = idx0.join(idx1, how="outer") + assert pd.isna(idx0.get_level_values(1)).all() + # the following failed in 0.14.1 + assert pd.isna(idxm.get_level_values(1)[:-1]).all() + + df0 = pd.DataFrame([[1, 2]], index=idx0) + df1 = pd.DataFrame([[3, 4]], index=idx1) + dfm = df0 - df1 + assert pd.isna(df0.index.get_level_values(1)).all() + # the following failed in 0.14.1 + assert pd.isna(dfm.index.get_level_values(1)[:-1]).all() + + +def test_tuples_have_na(): + index = MultiIndex( + levels=[[1, 0], [0, 1, 2, 3]], + codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]], + ) + + assert pd.isna(index[4][0]) + assert pd.isna(index.values[4][0]) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_monotonic.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_monotonic.py new file mode 100644 index 0000000000000000000000000000000000000000..2b0b3f7cb36d72abedc538eda9e6a85eb45067e2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_monotonic.py @@ -0,0 +1,188 @@ +import numpy as np +import pytest + +from pandas import ( + Index, + MultiIndex, +) + + +def test_is_monotonic_increasing_lexsorted(lexsorted_two_level_string_multiindex): + # string ordering + mi = lexsorted_two_level_string_multiindex + assert mi.is_monotonic_increasing is False + assert Index(mi.values).is_monotonic_increasing is False + assert mi._is_strictly_monotonic_increasing is False + assert Index(mi.values)._is_strictly_monotonic_increasing is False + + +def test_is_monotonic_increasing(): + i = MultiIndex.from_product([np.arange(10), np.arange(10)], names=["one", "two"]) + assert i.is_monotonic_increasing is True + assert i._is_strictly_monotonic_increasing is True + assert Index(i.values).is_monotonic_increasing is True + assert i._is_strictly_monotonic_increasing is True + + i = MultiIndex.from_product( + [np.arange(10, 0, -1), np.arange(10)], names=["one", "two"] + ) + assert i.is_monotonic_increasing is False + assert i._is_strictly_monotonic_increasing is False + assert Index(i.values).is_monotonic_increasing is False + assert Index(i.values)._is_strictly_monotonic_increasing is False + + i = MultiIndex.from_product( + [np.arange(10), np.arange(10, 0, -1)], names=["one", "two"] + ) + assert i.is_monotonic_increasing is False + assert i._is_strictly_monotonic_increasing is False + assert Index(i.values).is_monotonic_increasing is False + assert Index(i.values)._is_strictly_monotonic_increasing is False + + i = MultiIndex.from_product([[1.0, np.nan, 2.0], ["a", "b", "c"]]) + assert i.is_monotonic_increasing is False + assert i._is_strictly_monotonic_increasing is False + assert Index(i.values).is_monotonic_increasing is False + assert Index(i.values)._is_strictly_monotonic_increasing is False + + i = MultiIndex( + levels=[["bar", "baz", "foo", "qux"], ["mom", "next", "zenith"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + assert i.is_monotonic_increasing is True + assert Index(i.values).is_monotonic_increasing is True + assert i._is_strictly_monotonic_increasing is True + assert Index(i.values)._is_strictly_monotonic_increasing is True + + # mixed levels, hits the TypeError + i = MultiIndex( + levels=[ + [1, 2, 3, 4], + [ + "gb00b03mlx29", + "lu0197800237", + "nl0000289783", + "nl0000289965", + "nl0000301109", + ], + ], + codes=[[0, 1, 1, 2, 2, 2, 3], [4, 2, 0, 0, 1, 3, -1]], + names=["household_id", "asset_id"], + ) + + assert i.is_monotonic_increasing is False + assert i._is_strictly_monotonic_increasing is False + + # empty + i = MultiIndex.from_arrays([[], []]) + assert i.is_monotonic_increasing is True + assert Index(i.values).is_monotonic_increasing is True + assert i._is_strictly_monotonic_increasing is True + assert Index(i.values)._is_strictly_monotonic_increasing is True + + +def test_is_monotonic_decreasing(): + i = MultiIndex.from_product( + [np.arange(9, -1, -1), np.arange(9, -1, -1)], names=["one", "two"] + ) + assert i.is_monotonic_decreasing is True + assert i._is_strictly_monotonic_decreasing is True + assert Index(i.values).is_monotonic_decreasing is True + assert i._is_strictly_monotonic_decreasing is True + + i = MultiIndex.from_product( + [np.arange(10), np.arange(10, 0, -1)], names=["one", "two"] + ) + assert i.is_monotonic_decreasing is False + assert i._is_strictly_monotonic_decreasing is False + assert Index(i.values).is_monotonic_decreasing is False + assert Index(i.values)._is_strictly_monotonic_decreasing is False + + i = MultiIndex.from_product( + [np.arange(10, 0, -1), np.arange(10)], names=["one", "two"] + ) + assert i.is_monotonic_decreasing is False + assert i._is_strictly_monotonic_decreasing is False + assert Index(i.values).is_monotonic_decreasing is False + assert Index(i.values)._is_strictly_monotonic_decreasing is False + + i = MultiIndex.from_product([[2.0, np.nan, 1.0], ["c", "b", "a"]]) + assert i.is_monotonic_decreasing is False + assert i._is_strictly_monotonic_decreasing is False + assert Index(i.values).is_monotonic_decreasing is False + assert Index(i.values)._is_strictly_monotonic_decreasing is False + + # string ordering + i = MultiIndex( + levels=[["qux", "foo", "baz", "bar"], ["three", "two", "one"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + assert i.is_monotonic_decreasing is False + assert Index(i.values).is_monotonic_decreasing is False + assert i._is_strictly_monotonic_decreasing is False + assert Index(i.values)._is_strictly_monotonic_decreasing is False + + i = MultiIndex( + levels=[["qux", "foo", "baz", "bar"], ["zenith", "next", "mom"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + assert i.is_monotonic_decreasing is True + assert Index(i.values).is_monotonic_decreasing is True + assert i._is_strictly_monotonic_decreasing is True + assert Index(i.values)._is_strictly_monotonic_decreasing is True + + # mixed levels, hits the TypeError + i = MultiIndex( + levels=[ + [4, 3, 2, 1], + [ + "nl0000301109", + "nl0000289965", + "nl0000289783", + "lu0197800237", + "gb00b03mlx29", + ], + ], + codes=[[0, 1, 1, 2, 2, 2, 3], [4, 2, 0, 0, 1, 3, -1]], + names=["household_id", "asset_id"], + ) + + assert i.is_monotonic_decreasing is False + assert i._is_strictly_monotonic_decreasing is False + + # empty + i = MultiIndex.from_arrays([[], []]) + assert i.is_monotonic_decreasing is True + assert Index(i.values).is_monotonic_decreasing is True + assert i._is_strictly_monotonic_decreasing is True + assert Index(i.values)._is_strictly_monotonic_decreasing is True + + +def test_is_strictly_monotonic_increasing(): + idx = MultiIndex( + levels=[["bar", "baz"], ["mom", "next"]], codes=[[0, 0, 1, 1], [0, 0, 0, 1]] + ) + assert idx.is_monotonic_increasing is True + assert idx._is_strictly_monotonic_increasing is False + + +def test_is_strictly_monotonic_decreasing(): + idx = MultiIndex( + levels=[["baz", "bar"], ["next", "mom"]], codes=[[0, 0, 1, 1], [0, 0, 0, 1]] + ) + assert idx.is_monotonic_decreasing is True + assert idx._is_strictly_monotonic_decreasing is False + + +@pytest.mark.parametrize("attr", ["is_monotonic_increasing", "is_monotonic_decreasing"]) +@pytest.mark.parametrize( + "values", + [[(np.nan,), (1,), (2,)], [(1,), (np.nan,), (2,)], [(1,), (2,), (np.nan,)]], +) +def test_is_monotonic_with_nans(values, attr): + # GH: 37220 + idx = MultiIndex.from_tuples(values, names=["test"]) + assert getattr(idx, attr) is False diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_names.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_names.py new file mode 100644 index 0000000000000000000000000000000000000000..45f19b4d70fb95cb2aee459a54d2ad53790b7df8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_names.py @@ -0,0 +1,201 @@ +import pytest + +import pandas as pd +from pandas import MultiIndex +import pandas._testing as tm + + +def check_level_names(index, names): + assert [level.name for level in index.levels] == list(names) + + +def test_slice_keep_name(): + x = MultiIndex.from_tuples([("a", "b"), (1, 2), ("c", "d")], names=["x", "y"]) + assert x[1:].names == x.names + + +def test_index_name_retained(): + # GH9857 + result = pd.DataFrame({"x": [1, 2, 6], "y": [2, 2, 8], "z": [-5, 0, 5]}) + result = result.set_index("z") + result.loc[10] = [9, 10] + df_expected = pd.DataFrame( + {"x": [1, 2, 6, 9], "y": [2, 2, 8, 10], "z": [-5, 0, 5, 10]} + ) + df_expected = df_expected.set_index("z") + tm.assert_frame_equal(result, df_expected) + + +def test_changing_names(idx): + assert [level.name for level in idx.levels] == ["first", "second"] + + view = idx.view() + copy = idx.copy() + shallow_copy = idx._view() + + # changing names should not change level names on object + new_names = [name + "a" for name in idx.names] + idx.names = new_names + check_level_names(idx, ["firsta", "seconda"]) + + # and not on copies + check_level_names(view, ["first", "second"]) + check_level_names(copy, ["first", "second"]) + check_level_names(shallow_copy, ["first", "second"]) + + # and copies shouldn't change original + shallow_copy.names = [name + "c" for name in shallow_copy.names] + check_level_names(idx, ["firsta", "seconda"]) + + +def test_take_preserve_name(idx): + taken = idx.take([3, 0, 1]) + assert taken.names == idx.names + + +def test_copy_names(): + # Check that adding a "names" parameter to the copy is honored + # GH14302 + multi_idx = MultiIndex.from_tuples([(1, 2), (3, 4)], names=["MyName1", "MyName2"]) + multi_idx1 = multi_idx.copy() + + assert multi_idx.equals(multi_idx1) + assert multi_idx.names == ["MyName1", "MyName2"] + assert multi_idx1.names == ["MyName1", "MyName2"] + + multi_idx2 = multi_idx.copy(names=["NewName1", "NewName2"]) + + assert multi_idx.equals(multi_idx2) + assert multi_idx.names == ["MyName1", "MyName2"] + assert multi_idx2.names == ["NewName1", "NewName2"] + + multi_idx3 = multi_idx.copy(name=["NewName1", "NewName2"]) + + assert multi_idx.equals(multi_idx3) + assert multi_idx.names == ["MyName1", "MyName2"] + assert multi_idx3.names == ["NewName1", "NewName2"] + + # gh-35592 + with pytest.raises(ValueError, match="Length of new names must be 2, got 1"): + multi_idx.copy(names=["mario"]) + + with pytest.raises(TypeError, match="MultiIndex.name must be a hashable type"): + multi_idx.copy(names=[["mario"], ["luigi"]]) + + +def test_names(idx): + # names are assigned in setup + assert idx.names == ["first", "second"] + level_names = [level.name for level in idx.levels] + assert level_names == idx.names + + # setting bad names on existing + index = idx + with pytest.raises(ValueError, match="^Length of names"): + setattr(index, "names", list(index.names) + ["third"]) + with pytest.raises(ValueError, match="^Length of names"): + setattr(index, "names", []) + + # initializing with bad names (should always be equivalent) + major_axis, minor_axis = idx.levels + major_codes, minor_codes = idx.codes + with pytest.raises(ValueError, match="^Length of names"): + MultiIndex( + levels=[major_axis, minor_axis], + codes=[major_codes, minor_codes], + names=["first"], + ) + with pytest.raises(ValueError, match="^Length of names"): + MultiIndex( + levels=[major_axis, minor_axis], + codes=[major_codes, minor_codes], + names=["first", "second", "third"], + ) + + # names are assigned on index, but not transferred to the levels + index.names = ["a", "b"] + level_names = [level.name for level in index.levels] + assert level_names == ["a", "b"] + + +def test_duplicate_level_names_access_raises(idx): + # GH19029 + idx.names = ["foo", "foo"] + with pytest.raises(ValueError, match="name foo occurs multiple times"): + idx._get_level_number("foo") + + +def test_get_names_from_levels(): + idx = MultiIndex.from_product([["a"], [1, 2]], names=["a", "b"]) + + assert idx.levels[0].name == "a" + assert idx.levels[1].name == "b" + + +def test_setting_names_from_levels_raises(): + idx = MultiIndex.from_product([["a"], [1, 2]], names=["a", "b"]) + with pytest.raises(RuntimeError, match="set_names"): + idx.levels[0].name = "foo" + + with pytest.raises(RuntimeError, match="set_names"): + idx.levels[1].name = "foo" + + new = pd.Series(1, index=idx.levels[0]) + with pytest.raises(RuntimeError, match="set_names"): + new.index.name = "bar" + + assert pd.Index._no_setting_name is False + assert pd.RangeIndex._no_setting_name is False + + +@pytest.mark.parametrize("func", ["rename", "set_names"]) +@pytest.mark.parametrize( + "rename_dict, exp_names", + [ + ({"x": "z"}, ["z", "y", "z"]), + ({"x": "z", "y": "x"}, ["z", "x", "z"]), + ({"y": "z"}, ["x", "z", "x"]), + ({}, ["x", "y", "x"]), + ({"z": "a"}, ["x", "y", "x"]), + ({"y": "z", "a": "b"}, ["x", "z", "x"]), + ], +) +def test_name_mi_with_dict_like_duplicate_names(func, rename_dict, exp_names): + # GH#20421 + mi = MultiIndex.from_arrays([[1, 2], [3, 4], [5, 6]], names=["x", "y", "x"]) + result = getattr(mi, func)(rename_dict) + expected = MultiIndex.from_arrays([[1, 2], [3, 4], [5, 6]], names=exp_names) + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("func", ["rename", "set_names"]) +@pytest.mark.parametrize( + "rename_dict, exp_names", + [ + ({"x": "z"}, ["z", "y"]), + ({"x": "z", "y": "x"}, ["z", "x"]), + ({"a": "z"}, ["x", "y"]), + ({}, ["x", "y"]), + ], +) +def test_name_mi_with_dict_like(func, rename_dict, exp_names): + # GH#20421 + mi = MultiIndex.from_arrays([[1, 2], [3, 4]], names=["x", "y"]) + result = getattr(mi, func)(rename_dict) + expected = MultiIndex.from_arrays([[1, 2], [3, 4]], names=exp_names) + tm.assert_index_equal(result, expected) + + +def test_index_name_with_dict_like_raising(): + # GH#20421 + ix = pd.Index([1, 2]) + msg = "Can only pass dict-like as `names` for MultiIndex." + with pytest.raises(TypeError, match=msg): + ix.set_names({"x": "z"}) + + +def test_multiindex_name_and_level_raising(): + # GH#20421 + mi = MultiIndex.from_arrays([[1, 2], [3, 4]], names=["x", "y"]) + with pytest.raises(TypeError, match="Can not pass level for dictlike `names`."): + mi.set_names(names={"x": "z"}, level={"x": "z"}) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_partial_indexing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_partial_indexing.py new file mode 100644 index 0000000000000000000000000000000000000000..64cc1fa621b3195727cbfb3e62a8b6a6acf4dfaf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_partial_indexing.py @@ -0,0 +1,148 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + IndexSlice, + MultiIndex, + date_range, +) +import pandas._testing as tm + + +@pytest.fixture +def df(): + # c1 + # 2016-01-01 00:00:00 a 0 + # b 1 + # c 2 + # 2016-01-01 12:00:00 a 3 + # b 4 + # c 5 + # 2016-01-02 00:00:00 a 6 + # b 7 + # c 8 + # 2016-01-02 12:00:00 a 9 + # b 10 + # c 11 + # 2016-01-03 00:00:00 a 12 + # b 13 + # c 14 + dr = date_range("2016-01-01", "2016-01-03", freq="12h") + abc = ["a", "b", "c"] + mi = MultiIndex.from_product([dr, abc]) + frame = DataFrame({"c1": range(15)}, index=mi) + return frame + + +def test_partial_string_matching_single_index(df): + # partial string matching on a single index + for df_swap in [df.swaplevel(), df.swaplevel(0), df.swaplevel(0, 1)]: + df_swap = df_swap.sort_index() + just_a = df_swap.loc["a"] + result = just_a.loc["2016-01-01"] + expected = df.loc[IndexSlice[:, "a"], :].iloc[0:2] + expected.index = expected.index.droplevel(1) + tm.assert_frame_equal(result, expected) + + +def test_get_loc_partial_timestamp_multiindex(df): + mi = df.index + key = ("2016-01-01", "a") + loc = mi.get_loc(key) + + expected = np.zeros(len(mi), dtype=bool) + expected[[0, 3]] = True + tm.assert_numpy_array_equal(loc, expected) + + key2 = ("2016-01-02", "a") + loc2 = mi.get_loc(key2) + expected2 = np.zeros(len(mi), dtype=bool) + expected2[[6, 9]] = True + tm.assert_numpy_array_equal(loc2, expected2) + + key3 = ("2016-01", "a") + loc3 = mi.get_loc(key3) + expected3 = np.zeros(len(mi), dtype=bool) + expected3[mi.get_level_values(1).get_loc("a")] = True + tm.assert_numpy_array_equal(loc3, expected3) + + key4 = ("2016", "a") + loc4 = mi.get_loc(key4) + expected4 = expected3 + tm.assert_numpy_array_equal(loc4, expected4) + + # non-monotonic + taker = np.arange(len(mi), dtype=np.intp) + taker[::2] = taker[::-2] + mi2 = mi.take(taker) + loc5 = mi2.get_loc(key) + expected5 = np.zeros(len(mi2), dtype=bool) + expected5[[3, 14]] = True + tm.assert_numpy_array_equal(loc5, expected5) + + +def test_partial_string_timestamp_multiindex(df): + # GH10331 + df_swap = df.swaplevel(0, 1).sort_index() + SLC = IndexSlice + + # indexing with IndexSlice + result = df.loc[SLC["2016-01-01":"2016-02-01", :], :] + expected = df + tm.assert_frame_equal(result, expected) + + # match on secondary index + result = df_swap.loc[SLC[:, "2016-01-01":"2016-01-01"], :] + expected = df_swap.iloc[[0, 1, 5, 6, 10, 11]] + tm.assert_frame_equal(result, expected) + + # partial string match on year only + result = df.loc["2016"] + expected = df + tm.assert_frame_equal(result, expected) + + # partial string match on date + result = df.loc["2016-01-01"] + expected = df.iloc[0:6] + tm.assert_frame_equal(result, expected) + + # partial string match on date and hour, from middle + result = df.loc["2016-01-02 12"] + # hourly resolution, same as index.levels[0], so we are _not_ slicing on + # that level, so that level gets dropped + expected = df.iloc[9:12].droplevel(0) + tm.assert_frame_equal(result, expected) + + # partial string match on secondary index + result = df_swap.loc[SLC[:, "2016-01-02"], :] + expected = df_swap.iloc[[2, 3, 7, 8, 12, 13]] + tm.assert_frame_equal(result, expected) + + # tuple selector with partial string match on date + # "2016-01-01" has daily resolution, so _is_ a slice on the first level. + result = df.loc[("2016-01-01", "a"), :] + expected = df.iloc[[0, 3]] + expected = df.iloc[[0, 3]].droplevel(1) + tm.assert_frame_equal(result, expected) + + # Slicing date on first level should break (of course) bc the DTI is the + # second level on df_swap + with pytest.raises(KeyError, match="'2016-01-01'"): + df_swap.loc["2016-01-01"] + + +def test_partial_string_timestamp_multiindex_str_key_raises(df): + # Even though this syntax works on a single index, this is somewhat + # ambiguous and we don't want to extend this behavior forward to work + # in multi-indexes. This would amount to selecting a scalar from a + # column. + with pytest.raises(KeyError, match="'2016-01-01'"): + df["2016-01-01"] + + +def test_partial_string_timestamp_multiindex_daily_resolution(df): + # GH12685 (partial string with daily resolution or below) + result = df.loc[IndexSlice["2013-03":"2013-03", :], :] + expected = df.iloc[118:180] + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_pickle.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_pickle.py new file mode 100644 index 0000000000000000000000000000000000000000..1d8b72140442159fa0b8c608022d167bddd95db4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_pickle.py @@ -0,0 +1,10 @@ +import pytest + +from pandas import MultiIndex + + +def test_pickle_compat_construction(): + # this is testing for pickle compat + # need an object to create with + with pytest.raises(TypeError, match="Must pass both levels and codes"): + MultiIndex() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_reindex.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_reindex.py new file mode 100644 index 0000000000000000000000000000000000000000..d1b4fe8b98760a0b776c5d81d471a7745e8407de --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_reindex.py @@ -0,0 +1,174 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Index, + MultiIndex, +) +import pandas._testing as tm + + +def test_reindex(idx): + result, indexer = idx.reindex(list(idx[:4])) + assert isinstance(result, MultiIndex) + assert result.names == ["first", "second"] + assert [level.name for level in result.levels] == ["first", "second"] + + result, indexer = idx.reindex(list(idx)) + assert isinstance(result, MultiIndex) + assert indexer is None + assert result.names == ["first", "second"] + assert [level.name for level in result.levels] == ["first", "second"] + + +def test_reindex_level(idx): + index = Index(["one"]) + + target, indexer = idx.reindex(index, level="second") + target2, indexer2 = index.reindex(idx, level="second") + + exp_index = idx.join(index, level="second", how="right") + exp_index2 = idx.join(index, level="second", how="left") + + assert target.equals(exp_index) + exp_indexer = np.array([0, 2, 4]) + tm.assert_numpy_array_equal(indexer, exp_indexer, check_dtype=False) + + assert target2.equals(exp_index2) + exp_indexer2 = np.array([0, -1, 0, -1, 0, -1]) + tm.assert_numpy_array_equal(indexer2, exp_indexer2, check_dtype=False) + + with pytest.raises(TypeError, match="Fill method not supported"): + idx.reindex(idx, method="pad", level="second") + + +def test_reindex_preserves_names_when_target_is_list_or_ndarray(idx): + # GH6552 + idx = idx.copy() + target = idx.copy() + idx.names = target.names = [None, None] + + other_dtype = MultiIndex.from_product([[1, 2], [3, 4]]) + + # list & ndarray cases + assert idx.reindex([])[0].names == [None, None] + assert idx.reindex(np.array([]))[0].names == [None, None] + assert idx.reindex(target.tolist())[0].names == [None, None] + assert idx.reindex(target.values)[0].names == [None, None] + assert idx.reindex(other_dtype.tolist())[0].names == [None, None] + assert idx.reindex(other_dtype.values)[0].names == [None, None] + + idx.names = ["foo", "bar"] + assert idx.reindex([])[0].names == ["foo", "bar"] + assert idx.reindex(np.array([]))[0].names == ["foo", "bar"] + assert idx.reindex(target.tolist())[0].names == ["foo", "bar"] + assert idx.reindex(target.values)[0].names == ["foo", "bar"] + assert idx.reindex(other_dtype.tolist())[0].names == ["foo", "bar"] + assert idx.reindex(other_dtype.values)[0].names == ["foo", "bar"] + + +def test_reindex_lvl_preserves_names_when_target_is_list_or_array(): + # GH7774 + idx = MultiIndex.from_product([[0, 1], ["a", "b"]], names=["foo", "bar"]) + assert idx.reindex([], level=0)[0].names == ["foo", "bar"] + assert idx.reindex([], level=1)[0].names == ["foo", "bar"] + + +def test_reindex_lvl_preserves_type_if_target_is_empty_list_or_array( + using_infer_string, +): + # GH7774 + idx = MultiIndex.from_product([[0, 1], ["a", "b"]]) + assert idx.reindex([], level=0)[0].levels[0].dtype.type == np.int64 + exp = np.object_ if not using_infer_string else str + assert idx.reindex([], level=1)[0].levels[1].dtype.type == exp + + # case with EA levels + cat = pd.Categorical(["foo", "bar"]) + dti = pd.date_range("2016-01-01", periods=2, tz="US/Pacific") + mi = MultiIndex.from_product([cat, dti]) + assert mi.reindex([], level=0)[0].levels[0].dtype == cat.dtype + assert mi.reindex([], level=1)[0].levels[1].dtype == dti.dtype + + +def test_reindex_base(idx): + expected = np.arange(idx.size, dtype=np.intp) + + actual = idx.get_indexer(idx) + tm.assert_numpy_array_equal(expected, actual) + + with pytest.raises(ValueError, match="Invalid fill method"): + idx.get_indexer(idx, method="invalid") + + +def test_reindex_non_unique(): + idx = MultiIndex.from_tuples([(0, 0), (1, 1), (1, 1), (2, 2)]) + a = pd.Series(np.arange(4), index=idx) + new_idx = MultiIndex.from_tuples([(0, 0), (1, 1), (2, 2)]) + + msg = "cannot handle a non-unique multi-index!" + with pytest.raises(ValueError, match=msg): + a.reindex(new_idx) + + +@pytest.mark.parametrize("values", [[["a"], ["x"]], [[], []]]) +def test_reindex_empty_with_level(values): + # GH41170 + idx = MultiIndex.from_arrays(values) + result, result_indexer = idx.reindex(np.array(["b"]), level=0) + expected = MultiIndex(levels=[["b"], values[1]], codes=[[], []]) + expected_indexer = np.array([], dtype=result_indexer.dtype) + tm.assert_index_equal(result, expected) + tm.assert_numpy_array_equal(result_indexer, expected_indexer) + + +def test_reindex_not_all_tuples(): + keys = [("i", "i"), ("i", "j"), ("j", "i"), "j"] + mi = MultiIndex.from_tuples(keys[:-1]) + idx = Index(keys) + res, indexer = mi.reindex(idx) + + tm.assert_index_equal(res, idx) + expected = np.array([0, 1, 2, -1], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + +def test_reindex_limit_arg_with_multiindex(): + # GH21247 + + idx = MultiIndex.from_tuples([(3, "A"), (4, "A"), (4, "B")]) + + df = pd.Series([0.02, 0.01, 0.012], index=idx) + + new_idx = MultiIndex.from_tuples( + [ + (3, "A"), + (3, "B"), + (4, "A"), + (4, "B"), + (4, "C"), + (5, "B"), + (5, "C"), + (6, "B"), + (6, "C"), + ] + ) + + with pytest.raises( + ValueError, + match="limit argument only valid if doing pad, backfill or nearest reindexing", + ): + df.reindex(new_idx, fill_value=0, limit=1) + + +def test_reindex_with_none_in_nested_multiindex(): + # GH42883 + index = MultiIndex.from_tuples([(("a", None), 1), (("b", None), 2)]) + index2 = MultiIndex.from_tuples([(("b", None), 2), (("a", None), 1)]) + df1_dtype = pd.DataFrame([1, 2], index=index) + df2_dtype = pd.DataFrame([2, 1], index=index2) + + result = df1_dtype.reindex_like(df2_dtype) + expected = df2_dtype + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_reshape.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_reshape.py new file mode 100644 index 0000000000000000000000000000000000000000..06dbb33aadf97a54e4bb283d3aed8fe1169164b3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_reshape.py @@ -0,0 +1,224 @@ +from datetime import datetime + +import numpy as np +import pytest +import pytz + +import pandas as pd +from pandas import ( + Index, + MultiIndex, +) +import pandas._testing as tm + + +def test_insert(idx): + # key contained in all levels + new_index = idx.insert(0, ("bar", "two")) + assert new_index.equal_levels(idx) + assert new_index[0] == ("bar", "two") + + # key not contained in all levels + new_index = idx.insert(0, ("abc", "three")) + + exp0 = Index(list(idx.levels[0]) + ["abc"], name="first") + tm.assert_index_equal(new_index.levels[0], exp0) + assert new_index.names == ["first", "second"] + + exp1 = Index(list(idx.levels[1]) + ["three"], name="second") + tm.assert_index_equal(new_index.levels[1], exp1) + assert new_index[0] == ("abc", "three") + + # key wrong length + msg = "Item must have length equal to number of levels" + with pytest.raises(ValueError, match=msg): + idx.insert(0, ("foo2",)) + + left = pd.DataFrame([["a", "b", 0], ["b", "d", 1]], columns=["1st", "2nd", "3rd"]) + left.set_index(["1st", "2nd"], inplace=True) + ts = left["3rd"].copy(deep=True) + + left.loc[("b", "x"), "3rd"] = 2 + left.loc[("b", "a"), "3rd"] = -1 + left.loc[("b", "b"), "3rd"] = 3 + left.loc[("a", "x"), "3rd"] = 4 + left.loc[("a", "w"), "3rd"] = 5 + left.loc[("a", "a"), "3rd"] = 6 + + ts.loc[("b", "x")] = 2 + ts.loc["b", "a"] = -1 + ts.loc[("b", "b")] = 3 + ts.loc["a", "x"] = 4 + ts.loc[("a", "w")] = 5 + ts.loc["a", "a"] = 6 + + right = pd.DataFrame( + [ + ["a", "b", 0], + ["b", "d", 1], + ["b", "x", 2], + ["b", "a", -1], + ["b", "b", 3], + ["a", "x", 4], + ["a", "w", 5], + ["a", "a", 6], + ], + columns=["1st", "2nd", "3rd"], + ) + right.set_index(["1st", "2nd"], inplace=True) + # FIXME data types changes to float because + # of intermediate nan insertion; + tm.assert_frame_equal(left, right, check_dtype=False) + tm.assert_series_equal(ts, right["3rd"]) + + +def test_insert2(): + # GH9250 + idx = ( + [("test1", i) for i in range(5)] + + [("test2", i) for i in range(6)] + + [("test", 17), ("test", 18)] + ) + + left = pd.Series(np.linspace(0, 10, 11), MultiIndex.from_tuples(idx[:-2])) + + left.loc[("test", 17)] = 11 + left.loc[("test", 18)] = 12 + + right = pd.Series(np.linspace(0, 12, 13), MultiIndex.from_tuples(idx)) + + tm.assert_series_equal(left, right) + + +def test_append(idx): + result = idx[:3].append(idx[3:]) + assert result.equals(idx) + + foos = [idx[:1], idx[1:3], idx[3:]] + result = foos[0].append(foos[1:]) + assert result.equals(idx) + + # empty + result = idx.append([]) + assert result.equals(idx) + + +def test_append_index(): + idx1 = Index([1.1, 1.2, 1.3]) + idx2 = pd.date_range("2011-01-01", freq="D", periods=3, tz="Asia/Tokyo") + idx3 = Index(["A", "B", "C"]) + + midx_lv2 = MultiIndex.from_arrays([idx1, idx2]) + midx_lv3 = MultiIndex.from_arrays([idx1, idx2, idx3]) + + result = idx1.append(midx_lv2) + + # see gh-7112 + tz = pytz.timezone("Asia/Tokyo") + expected_tuples = [ + (1.1, tz.localize(datetime(2011, 1, 1))), + (1.2, tz.localize(datetime(2011, 1, 2))), + (1.3, tz.localize(datetime(2011, 1, 3))), + ] + expected = Index([1.1, 1.2, 1.3] + expected_tuples) + tm.assert_index_equal(result, expected) + + result = midx_lv2.append(idx1) + expected = Index(expected_tuples + [1.1, 1.2, 1.3]) + tm.assert_index_equal(result, expected) + + result = midx_lv2.append(midx_lv2) + expected = MultiIndex.from_arrays([idx1.append(idx1), idx2.append(idx2)]) + tm.assert_index_equal(result, expected) + + result = midx_lv2.append(midx_lv3) + tm.assert_index_equal(result, expected) + + result = midx_lv3.append(midx_lv2) + expected = Index._simple_new( + np.array( + [ + (1.1, tz.localize(datetime(2011, 1, 1)), "A"), + (1.2, tz.localize(datetime(2011, 1, 2)), "B"), + (1.3, tz.localize(datetime(2011, 1, 3)), "C"), + ] + + expected_tuples, + dtype=object, + ), + None, + ) + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("name, exp", [("b", "b"), ("c", None)]) +def test_append_names_match(name, exp): + # GH#48288 + midx = MultiIndex.from_arrays([[1, 2], [3, 4]], names=["a", "b"]) + midx2 = MultiIndex.from_arrays([[3], [5]], names=["a", name]) + result = midx.append(midx2) + expected = MultiIndex.from_arrays([[1, 2, 3], [3, 4, 5]], names=["a", exp]) + tm.assert_index_equal(result, expected) + + +def test_append_names_dont_match(): + # GH#48288 + midx = MultiIndex.from_arrays([[1, 2], [3, 4]], names=["a", "b"]) + midx2 = MultiIndex.from_arrays([[3], [5]], names=["x", "y"]) + result = midx.append(midx2) + expected = MultiIndex.from_arrays([[1, 2, 3], [3, 4, 5]], names=None) + tm.assert_index_equal(result, expected) + + +def test_append_overlapping_interval_levels(): + # GH 54934 + ivl1 = pd.IntervalIndex.from_breaks([0.0, 1.0, 2.0]) + ivl2 = pd.IntervalIndex.from_breaks([0.5, 1.5, 2.5]) + mi1 = MultiIndex.from_product([ivl1, ivl1]) + mi2 = MultiIndex.from_product([ivl2, ivl2]) + result = mi1.append(mi2) + expected = MultiIndex.from_tuples( + [ + (pd.Interval(0.0, 1.0), pd.Interval(0.0, 1.0)), + (pd.Interval(0.0, 1.0), pd.Interval(1.0, 2.0)), + (pd.Interval(1.0, 2.0), pd.Interval(0.0, 1.0)), + (pd.Interval(1.0, 2.0), pd.Interval(1.0, 2.0)), + (pd.Interval(0.5, 1.5), pd.Interval(0.5, 1.5)), + (pd.Interval(0.5, 1.5), pd.Interval(1.5, 2.5)), + (pd.Interval(1.5, 2.5), pd.Interval(0.5, 1.5)), + (pd.Interval(1.5, 2.5), pd.Interval(1.5, 2.5)), + ] + ) + tm.assert_index_equal(result, expected) + + +def test_repeat(): + reps = 2 + numbers = [1, 2, 3] + names = np.array(["foo", "bar"]) + + m = MultiIndex.from_product([numbers, names], names=names) + expected = MultiIndex.from_product([numbers, names.repeat(reps)], names=names) + tm.assert_index_equal(m.repeat(reps), expected) + + +def test_insert_base(idx): + result = idx[1:4] + + # test 0th element + assert idx[0:4].equals(result.insert(0, idx[0])) + + +def test_delete_base(idx): + expected = idx[1:] + result = idx.delete(0) + assert result.equals(expected) + assert result.name == expected.name + + expected = idx[:-1] + result = idx.delete(-1) + assert result.equals(expected) + assert result.name == expected.name + + msg = "index 6 is out of bounds for axis 0 with size 6" + with pytest.raises(IndexError, match=msg): + idx.delete(len(idx)) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_setops.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_setops.py new file mode 100644 index 0000000000000000000000000000000000000000..0abb56ecf9de7182dd38e45d5a8938cb81248034 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_setops.py @@ -0,0 +1,772 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + CategoricalIndex, + DataFrame, + Index, + IntervalIndex, + MultiIndex, + Series, +) +import pandas._testing as tm +from pandas.api.types import ( + is_float_dtype, + is_unsigned_integer_dtype, +) + + +@pytest.mark.parametrize("case", [0.5, "xxx"]) +@pytest.mark.parametrize( + "method", ["intersection", "union", "difference", "symmetric_difference"] +) +def test_set_ops_error_cases(idx, case, sort, method): + # non-iterable input + msg = "Input must be Index or array-like" + with pytest.raises(TypeError, match=msg): + getattr(idx, method)(case, sort=sort) + + +@pytest.mark.parametrize("klass", [MultiIndex, np.array, Series, list]) +def test_intersection_base(idx, sort, klass): + first = idx[2::-1] # first 3 elements reversed + second = idx[:5] + + if klass is not MultiIndex: + second = klass(second.values) + + intersect = first.intersection(second, sort=sort) + if sort is None: + expected = first.sort_values() + else: + expected = first + tm.assert_index_equal(intersect, expected) + + msg = "other must be a MultiIndex or a list of tuples" + with pytest.raises(TypeError, match=msg): + first.intersection([1, 2, 3], sort=sort) + + +@pytest.mark.arm_slow +@pytest.mark.parametrize("klass", [MultiIndex, np.array, Series, list]) +def test_union_base(idx, sort, klass): + first = idx[::-1] + second = idx[:5] + + if klass is not MultiIndex: + second = klass(second.values) + + union = first.union(second, sort=sort) + if sort is None: + expected = first.sort_values() + else: + expected = first + tm.assert_index_equal(union, expected) + + msg = "other must be a MultiIndex or a list of tuples" + with pytest.raises(TypeError, match=msg): + first.union([1, 2, 3], sort=sort) + + +def test_difference_base(idx, sort): + second = idx[4:] + answer = idx[:4] + result = idx.difference(second, sort=sort) + + if sort is None: + answer = answer.sort_values() + + assert result.equals(answer) + tm.assert_index_equal(result, answer) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + result = idx.difference(case, sort=sort) + tm.assert_index_equal(result, answer) + + msg = "other must be a MultiIndex or a list of tuples" + with pytest.raises(TypeError, match=msg): + idx.difference([1, 2, 3], sort=sort) + + +def test_symmetric_difference(idx, sort): + first = idx[1:] + second = idx[:-1] + answer = idx[[-1, 0]] + result = first.symmetric_difference(second, sort=sort) + + if sort is None: + answer = answer.sort_values() + + tm.assert_index_equal(result, answer) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + result = first.symmetric_difference(case, sort=sort) + tm.assert_index_equal(result, answer) + + msg = "other must be a MultiIndex or a list of tuples" + with pytest.raises(TypeError, match=msg): + first.symmetric_difference([1, 2, 3], sort=sort) + + +def test_multiindex_symmetric_difference(): + # GH 13490 + idx = MultiIndex.from_product([["a", "b"], ["A", "B"]], names=["a", "b"]) + result = idx.symmetric_difference(idx) + assert result.names == idx.names + + idx2 = idx.copy().rename(["A", "B"]) + result = idx.symmetric_difference(idx2) + assert result.names == [None, None] + + +def test_empty(idx): + # GH 15270 + assert not idx.empty + assert idx[:0].empty + + +def test_difference(idx, sort): + first = idx + result = first.difference(idx[-3:], sort=sort) + vals = idx[:-3].values + + if sort is None: + vals = sorted(vals) + + expected = MultiIndex.from_tuples(vals, sortorder=0, names=idx.names) + + assert isinstance(result, MultiIndex) + assert result.equals(expected) + assert result.names == idx.names + tm.assert_index_equal(result, expected) + + # empty difference: reflexive + result = idx.difference(idx, sort=sort) + expected = idx[:0] + assert result.equals(expected) + assert result.names == idx.names + + # empty difference: superset + result = idx[-3:].difference(idx, sort=sort) + expected = idx[:0] + assert result.equals(expected) + assert result.names == idx.names + + # empty difference: degenerate + result = idx[:0].difference(idx, sort=sort) + expected = idx[:0] + assert result.equals(expected) + assert result.names == idx.names + + # names not the same + chunklet = idx[-3:] + chunklet.names = ["foo", "baz"] + result = first.difference(chunklet, sort=sort) + assert result.names == (None, None) + + # empty, but non-equal + result = idx.difference(idx.sortlevel(1)[0], sort=sort) + assert len(result) == 0 + + # raise Exception called with non-MultiIndex + result = first.difference(first.values, sort=sort) + assert result.equals(first[:0]) + + # name from empty array + result = first.difference([], sort=sort) + assert first.equals(result) + assert first.names == result.names + + # name from non-empty array + result = first.difference([("foo", "one")], sort=sort) + expected = MultiIndex.from_tuples( + [("bar", "one"), ("baz", "two"), ("foo", "two"), ("qux", "one"), ("qux", "two")] + ) + expected.names = first.names + assert first.names == result.names + + msg = "other must be a MultiIndex or a list of tuples" + with pytest.raises(TypeError, match=msg): + first.difference([1, 2, 3, 4, 5], sort=sort) + + +def test_difference_sort_special(): + # GH-24959 + idx = MultiIndex.from_product([[1, 0], ["a", "b"]]) + # sort=None, the default + result = idx.difference([]) + tm.assert_index_equal(result, idx) + + +def test_difference_sort_special_true(): + idx = MultiIndex.from_product([[1, 0], ["a", "b"]]) + result = idx.difference([], sort=True) + expected = MultiIndex.from_product([[0, 1], ["a", "b"]]) + tm.assert_index_equal(result, expected) + + +def test_difference_sort_incomparable(): + # GH-24959 + idx = MultiIndex.from_product([[1, pd.Timestamp("2000"), 2], ["a", "b"]]) + + other = MultiIndex.from_product([[3, pd.Timestamp("2000"), 4], ["c", "d"]]) + # sort=None, the default + msg = "sort order is undefined for incomparable objects" + with tm.assert_produces_warning(RuntimeWarning, match=msg): + result = idx.difference(other) + tm.assert_index_equal(result, idx) + + # sort=False + result = idx.difference(other, sort=False) + tm.assert_index_equal(result, idx) + + +def test_difference_sort_incomparable_true(): + idx = MultiIndex.from_product([[1, pd.Timestamp("2000"), 2], ["a", "b"]]) + other = MultiIndex.from_product([[3, pd.Timestamp("2000"), 4], ["c", "d"]]) + + # TODO: this is raising in constructing a Categorical when calling + # algos.safe_sort. Should we catch and re-raise with a better message? + msg = "'values' is not ordered, please explicitly specify the categories order " + with pytest.raises(TypeError, match=msg): + idx.difference(other, sort=True) + + +def test_union(idx, sort): + piece1 = idx[:5][::-1] + piece2 = idx[3:] + + the_union = piece1.union(piece2, sort=sort) + + if sort in (None, False): + tm.assert_index_equal(the_union.sort_values(), idx.sort_values()) + else: + tm.assert_index_equal(the_union, idx) + + # corner case, pass self or empty thing: + the_union = idx.union(idx, sort=sort) + tm.assert_index_equal(the_union, idx) + + the_union = idx.union(idx[:0], sort=sort) + tm.assert_index_equal(the_union, idx) + + tuples = idx.values + result = idx[:4].union(tuples[4:], sort=sort) + if sort is None: + tm.assert_index_equal(result.sort_values(), idx.sort_values()) + else: + assert result.equals(idx) + + +def test_union_with_regular_index(idx, using_infer_string): + other = Index(["A", "B", "C"]) + + result = other.union(idx) + assert ("foo", "one") in result + assert "B" in result + + if using_infer_string: + with pytest.raises(NotImplementedError, match="Can only union"): + idx.union(other) + else: + msg = "The values in the array are unorderable" + with tm.assert_produces_warning(RuntimeWarning, match=msg): + result2 = idx.union(other) + # This is more consistent now, if sorting fails then we don't sort at all + # in the MultiIndex case. + assert not result.equals(result2) + + +def test_intersection(idx, sort): + piece1 = idx[:5][::-1] + piece2 = idx[3:] + + the_int = piece1.intersection(piece2, sort=sort) + + if sort in (None, True): + tm.assert_index_equal(the_int, idx[3:5]) + else: + tm.assert_index_equal(the_int.sort_values(), idx[3:5]) + + # corner case, pass self + the_int = idx.intersection(idx, sort=sort) + tm.assert_index_equal(the_int, idx) + + # empty intersection: disjoint + empty = idx[:2].intersection(idx[2:], sort=sort) + expected = idx[:0] + assert empty.equals(expected) + + tuples = idx.values + result = idx.intersection(tuples) + assert result.equals(idx) + + +@pytest.mark.parametrize( + "method", ["intersection", "union", "difference", "symmetric_difference"] +) +def test_setop_with_categorical(idx, sort, method): + other = idx.to_flat_index().astype("category") + res_names = [None] * idx.nlevels + + result = getattr(idx, method)(other, sort=sort) + expected = getattr(idx, method)(idx, sort=sort).rename(res_names) + tm.assert_index_equal(result, expected) + + result = getattr(idx, method)(other[:5], sort=sort) + expected = getattr(idx, method)(idx[:5], sort=sort).rename(res_names) + tm.assert_index_equal(result, expected) + + +def test_intersection_non_object(idx, sort): + other = Index(range(3), name="foo") + + result = idx.intersection(other, sort=sort) + expected = MultiIndex(levels=idx.levels, codes=[[]] * idx.nlevels, names=None) + tm.assert_index_equal(result, expected, exact=True) + + # if we pass a length-0 ndarray (i.e. no name, we retain our idx.name) + result = idx.intersection(np.asarray(other)[:0], sort=sort) + expected = MultiIndex(levels=idx.levels, codes=[[]] * idx.nlevels, names=idx.names) + tm.assert_index_equal(result, expected, exact=True) + + msg = "other must be a MultiIndex or a list of tuples" + with pytest.raises(TypeError, match=msg): + # With non-zero length non-index, we try and fail to convert to tuples + idx.intersection(np.asarray(other), sort=sort) + + +def test_intersect_equal_sort(): + # GH-24959 + idx = MultiIndex.from_product([[1, 0], ["a", "b"]]) + tm.assert_index_equal(idx.intersection(idx, sort=False), idx) + tm.assert_index_equal(idx.intersection(idx, sort=None), idx) + + +def test_intersect_equal_sort_true(): + idx = MultiIndex.from_product([[1, 0], ["a", "b"]]) + expected = MultiIndex.from_product([[0, 1], ["a", "b"]]) + result = idx.intersection(idx, sort=True) + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("slice_", [slice(None), slice(0)]) +def test_union_sort_other_empty(slice_): + # https://github.com/pandas-dev/pandas/issues/24959 + idx = MultiIndex.from_product([[1, 0], ["a", "b"]]) + + # default, sort=None + other = idx[slice_] + tm.assert_index_equal(idx.union(other), idx) + tm.assert_index_equal(other.union(idx), idx) + + # sort=False + tm.assert_index_equal(idx.union(other, sort=False), idx) + + +def test_union_sort_other_empty_sort(): + idx = MultiIndex.from_product([[1, 0], ["a", "b"]]) + other = idx[:0] + result = idx.union(other, sort=True) + expected = MultiIndex.from_product([[0, 1], ["a", "b"]]) + tm.assert_index_equal(result, expected) + + +def test_union_sort_other_incomparable(): + # https://github.com/pandas-dev/pandas/issues/24959 + idx = MultiIndex.from_product([[1, pd.Timestamp("2000")], ["a", "b"]]) + + # default, sort=None + with tm.assert_produces_warning(RuntimeWarning): + result = idx.union(idx[:1]) + tm.assert_index_equal(result, idx) + + # sort=False + result = idx.union(idx[:1], sort=False) + tm.assert_index_equal(result, idx) + + +def test_union_sort_other_incomparable_sort(): + idx = MultiIndex.from_product([[1, pd.Timestamp("2000")], ["a", "b"]]) + msg = "'<' not supported between instances of 'Timestamp' and 'int'" + with pytest.raises(TypeError, match=msg): + idx.union(idx[:1], sort=True) + + +def test_union_non_object_dtype_raises(): + # GH#32646 raise NotImplementedError instead of less-informative error + mi = MultiIndex.from_product([["a", "b"], [1, 2]]) + + idx = mi.levels[1] + + msg = "Can only union MultiIndex with MultiIndex or Index of tuples" + with pytest.raises(NotImplementedError, match=msg): + mi.union(idx) + + +def test_union_empty_self_different_names(): + # GH#38423 + mi = MultiIndex.from_arrays([[]]) + mi2 = MultiIndex.from_arrays([[1, 2], [3, 4]], names=["a", "b"]) + result = mi.union(mi2) + expected = MultiIndex.from_arrays([[1, 2], [3, 4]]) + tm.assert_index_equal(result, expected) + + +def test_union_multiindex_empty_rangeindex(): + # GH#41234 + mi = MultiIndex.from_arrays([[1, 2], [3, 4]], names=["a", "b"]) + ri = pd.RangeIndex(0) + + result_left = mi.union(ri) + tm.assert_index_equal(mi, result_left, check_names=False) + + result_right = ri.union(mi) + tm.assert_index_equal(mi, result_right, check_names=False) + + +@pytest.mark.parametrize( + "method", ["union", "intersection", "difference", "symmetric_difference"] +) +def test_setops_sort_validation(method): + idx1 = MultiIndex.from_product([["a", "b"], [1, 2]]) + idx2 = MultiIndex.from_product([["b", "c"], [1, 2]]) + + with pytest.raises(ValueError, match="The 'sort' keyword only takes"): + getattr(idx1, method)(idx2, sort=2) + + # sort=True is supported as of GH#? + getattr(idx1, method)(idx2, sort=True) + + +@pytest.mark.parametrize("val", [pd.NA, 100]) +def test_difference_keep_ea_dtypes(any_numeric_ea_dtype, val): + # GH#48606 + midx = MultiIndex.from_arrays( + [Series([1, 2], dtype=any_numeric_ea_dtype), [2, 1]], names=["a", None] + ) + midx2 = MultiIndex.from_arrays( + [Series([1, 2, val], dtype=any_numeric_ea_dtype), [1, 1, 3]] + ) + result = midx.difference(midx2) + expected = MultiIndex.from_arrays([Series([1], dtype=any_numeric_ea_dtype), [2]]) + tm.assert_index_equal(result, expected) + + result = midx.difference(midx.sort_values(ascending=False)) + expected = MultiIndex.from_arrays( + [Series([], dtype=any_numeric_ea_dtype), Series([], dtype=np.int64)], + names=["a", None], + ) + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("val", [pd.NA, 5]) +def test_symmetric_difference_keeping_ea_dtype(any_numeric_ea_dtype, val): + # GH#48607 + midx = MultiIndex.from_arrays( + [Series([1, 2], dtype=any_numeric_ea_dtype), [2, 1]], names=["a", None] + ) + midx2 = MultiIndex.from_arrays( + [Series([1, 2, val], dtype=any_numeric_ea_dtype), [1, 1, 3]] + ) + result = midx.symmetric_difference(midx2) + expected = MultiIndex.from_arrays( + [Series([1, 1, val], dtype=any_numeric_ea_dtype), [1, 2, 3]] + ) + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize( + ("tuples", "exp_tuples"), + [ + ([("val1", "test1")], [("val1", "test1")]), + ([("val1", "test1"), ("val1", "test1")], [("val1", "test1")]), + ( + [("val2", "test2"), ("val1", "test1")], + [("val2", "test2"), ("val1", "test1")], + ), + ], +) +def test_intersect_with_duplicates(tuples, exp_tuples): + # GH#36915 + left = MultiIndex.from_tuples(tuples, names=["first", "second"]) + right = MultiIndex.from_tuples( + [("val1", "test1"), ("val1", "test1"), ("val2", "test2")], + names=["first", "second"], + ) + result = left.intersection(right) + expected = MultiIndex.from_tuples(exp_tuples, names=["first", "second"]) + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize( + "data, names, expected", + [ + ((1,), None, [None, None]), + ((1,), ["a"], [None, None]), + ((1,), ["b"], [None, None]), + ((1, 2), ["c", "d"], [None, None]), + ((1, 2), ["b", "a"], [None, None]), + ((1, 2, 3), ["a", "b", "c"], [None, None]), + ((1, 2), ["a", "c"], ["a", None]), + ((1, 2), ["c", "b"], [None, "b"]), + ((1, 2), ["a", "b"], ["a", "b"]), + ((1, 2), [None, "b"], [None, "b"]), + ], +) +def test_maybe_match_names(data, names, expected): + # GH#38323 + mi = MultiIndex.from_tuples([], names=["a", "b"]) + mi2 = MultiIndex.from_tuples([data], names=names) + result = mi._maybe_match_names(mi2) + assert result == expected + + +def test_intersection_equal_different_names(): + # GH#30302 + mi1 = MultiIndex.from_arrays([[1, 2], [3, 4]], names=["c", "b"]) + mi2 = MultiIndex.from_arrays([[1, 2], [3, 4]], names=["a", "b"]) + + result = mi1.intersection(mi2) + expected = MultiIndex.from_arrays([[1, 2], [3, 4]], names=[None, "b"]) + tm.assert_index_equal(result, expected) + + +def test_intersection_different_names(): + # GH#38323 + mi = MultiIndex.from_arrays([[1], [3]], names=["c", "b"]) + mi2 = MultiIndex.from_arrays([[1], [3]]) + result = mi.intersection(mi2) + tm.assert_index_equal(result, mi2) + + +def test_intersection_with_missing_values_on_both_sides(nulls_fixture): + # GH#38623 + mi1 = MultiIndex.from_arrays([[3, nulls_fixture, 4, nulls_fixture], [1, 2, 4, 2]]) + mi2 = MultiIndex.from_arrays([[3, nulls_fixture, 3], [1, 2, 4]]) + result = mi1.intersection(mi2) + expected = MultiIndex.from_arrays([[3, nulls_fixture], [1, 2]]) + tm.assert_index_equal(result, expected) + + +def test_union_with_missing_values_on_both_sides(nulls_fixture): + # GH#38623 + mi1 = MultiIndex.from_arrays([[1, nulls_fixture]]) + mi2 = MultiIndex.from_arrays([[1, nulls_fixture, 3]]) + result = mi1.union(mi2) + expected = MultiIndex.from_arrays([[1, 3, nulls_fixture]]) + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["float64", "Float64"]) +@pytest.mark.parametrize("sort", [None, False]) +def test_union_nan_got_duplicated(dtype, sort): + # GH#38977, GH#49010 + mi1 = MultiIndex.from_arrays([pd.array([1.0, np.nan], dtype=dtype), [2, 3]]) + mi2 = MultiIndex.from_arrays([pd.array([1.0, np.nan, 3.0], dtype=dtype), [2, 3, 4]]) + result = mi1.union(mi2, sort=sort) + if sort is None: + expected = MultiIndex.from_arrays( + [pd.array([1.0, 3.0, np.nan], dtype=dtype), [2, 4, 3]] + ) + else: + expected = mi2 + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("val", [4, 1]) +def test_union_keep_ea_dtype(any_numeric_ea_dtype, val): + # GH#48505 + + arr1 = Series([val, 2], dtype=any_numeric_ea_dtype) + arr2 = Series([2, 1], dtype=any_numeric_ea_dtype) + midx = MultiIndex.from_arrays([arr1, [1, 2]], names=["a", None]) + midx2 = MultiIndex.from_arrays([arr2, [2, 1]]) + result = midx.union(midx2) + if val == 4: + expected = MultiIndex.from_arrays( + [Series([1, 2, 4], dtype=any_numeric_ea_dtype), [1, 2, 1]] + ) + else: + expected = MultiIndex.from_arrays( + [Series([1, 2], dtype=any_numeric_ea_dtype), [1, 2]] + ) + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("dupe_val", [3, pd.NA]) +def test_union_with_duplicates_keep_ea_dtype(dupe_val, any_numeric_ea_dtype): + # GH48900 + mi1 = MultiIndex.from_arrays( + [ + Series([1, dupe_val, 2], dtype=any_numeric_ea_dtype), + Series([1, dupe_val, 2], dtype=any_numeric_ea_dtype), + ] + ) + mi2 = MultiIndex.from_arrays( + [ + Series([2, dupe_val, dupe_val], dtype=any_numeric_ea_dtype), + Series([2, dupe_val, dupe_val], dtype=any_numeric_ea_dtype), + ] + ) + result = mi1.union(mi2) + expected = MultiIndex.from_arrays( + [ + Series([1, 2, dupe_val, dupe_val], dtype=any_numeric_ea_dtype), + Series([1, 2, dupe_val, dupe_val], dtype=any_numeric_ea_dtype), + ] + ) + tm.assert_index_equal(result, expected) + + +@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") +def test_union_duplicates(index, request): + # GH#38977 + if index.empty or isinstance(index, (IntervalIndex, CategoricalIndex)): + pytest.skip(f"No duplicates in an empty {type(index).__name__}") + + values = index.unique().values.tolist() + mi1 = MultiIndex.from_arrays([values, [1] * len(values)]) + mi2 = MultiIndex.from_arrays([[values[0]] + values, [1] * (len(values) + 1)]) + result = mi2.union(mi1) + expected = mi2.sort_values() + tm.assert_index_equal(result, expected) + + if ( + is_unsigned_integer_dtype(mi2.levels[0]) + and (mi2.get_level_values(0) < 2**63).all() + ): + # GH#47294 - union uses lib.fast_zip, converting data to Python integers + # and loses type information. Result is then unsigned only when values are + # sufficiently large to require unsigned dtype. This happens only if other + # has dups or one of both have missing values + expected = expected.set_levels( + [expected.levels[0].astype(np.int64), expected.levels[1]] + ) + elif is_float_dtype(mi2.levels[0]): + # mi2 has duplicates witch is a different path than above, Fix that path + # to use correct float dtype? + expected = expected.set_levels( + [expected.levels[0].astype(float), expected.levels[1]] + ) + + result = mi1.union(mi2) + tm.assert_index_equal(result, expected) + + +def test_union_keep_dtype_precision(any_real_numeric_dtype): + # GH#48498 + arr1 = Series([4, 1, 1], dtype=any_real_numeric_dtype) + arr2 = Series([1, 4], dtype=any_real_numeric_dtype) + midx = MultiIndex.from_arrays([arr1, [2, 1, 1]], names=["a", None]) + midx2 = MultiIndex.from_arrays([arr2, [1, 2]], names=["a", None]) + + result = midx.union(midx2) + expected = MultiIndex.from_arrays( + ([Series([1, 1, 4], dtype=any_real_numeric_dtype), [1, 1, 2]]), + names=["a", None], + ) + tm.assert_index_equal(result, expected) + + +def test_union_keep_ea_dtype_with_na(any_numeric_ea_dtype): + # GH#48498 + arr1 = Series([4, pd.NA], dtype=any_numeric_ea_dtype) + arr2 = Series([1, pd.NA], dtype=any_numeric_ea_dtype) + midx = MultiIndex.from_arrays([arr1, [2, 1]], names=["a", None]) + midx2 = MultiIndex.from_arrays([arr2, [1, 2]]) + result = midx.union(midx2) + expected = MultiIndex.from_arrays( + [Series([1, 4, pd.NA, pd.NA], dtype=any_numeric_ea_dtype), [1, 2, 1, 2]] + ) + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize( + "levels1, levels2, codes1, codes2, names", + [ + ( + [["a", "b", "c"], [0, ""]], + [["c", "d", "b"], [""]], + [[0, 1, 2], [1, 1, 1]], + [[0, 1, 2], [0, 0, 0]], + ["name1", "name2"], + ), + ], +) +def test_intersection_lexsort_depth(levels1, levels2, codes1, codes2, names): + # GH#25169 + mi1 = MultiIndex(levels=levels1, codes=codes1, names=names) + mi2 = MultiIndex(levels=levels2, codes=codes2, names=names) + mi_int = mi1.intersection(mi2) + assert mi_int._lexsort_depth == 2 + + +@pytest.mark.parametrize( + "a", + [pd.Categorical(["a", "b"], categories=["a", "b"]), ["a", "b"]], +) +@pytest.mark.parametrize( + "b", + [ + pd.Categorical(["a", "b"], categories=["b", "a"], ordered=True), + pd.Categorical(["a", "b"], categories=["b", "a"]), + ], +) +def test_intersection_with_non_lex_sorted_categories(a, b): + # GH#49974 + other = ["1", "2"] + + df1 = DataFrame({"x": a, "y": other}) + df2 = DataFrame({"x": b, "y": other}) + + expected = MultiIndex.from_arrays([a, other], names=["x", "y"]) + + res1 = MultiIndex.from_frame(df1).intersection( + MultiIndex.from_frame(df2.sort_values(["x", "y"])) + ) + res2 = MultiIndex.from_frame(df1).intersection(MultiIndex.from_frame(df2)) + res3 = MultiIndex.from_frame(df1.sort_values(["x", "y"])).intersection( + MultiIndex.from_frame(df2) + ) + res4 = MultiIndex.from_frame(df1.sort_values(["x", "y"])).intersection( + MultiIndex.from_frame(df2.sort_values(["x", "y"])) + ) + + tm.assert_index_equal(res1, expected) + tm.assert_index_equal(res2, expected) + tm.assert_index_equal(res3, expected) + tm.assert_index_equal(res4, expected) + + +@pytest.mark.parametrize("val", [pd.NA, 100]) +def test_intersection_keep_ea_dtypes(val, any_numeric_ea_dtype): + # GH#48604 + midx = MultiIndex.from_arrays( + [Series([1, 2], dtype=any_numeric_ea_dtype), [2, 1]], names=["a", None] + ) + midx2 = MultiIndex.from_arrays( + [Series([1, 2, val], dtype=any_numeric_ea_dtype), [1, 1, 3]] + ) + result = midx.intersection(midx2) + expected = MultiIndex.from_arrays([Series([2], dtype=any_numeric_ea_dtype), [1]]) + tm.assert_index_equal(result, expected) + + +def test_union_with_na_when_constructing_dataframe(): + # GH43222 + series1 = Series( + (1,), + index=MultiIndex.from_arrays( + [Series([None], dtype="string"), Series([None], dtype="string")] + ), + ) + series2 = Series((10, 20), index=MultiIndex.from_tuples(((None, None), ("a", "b")))) + result = DataFrame([series1, series2]) + expected = DataFrame({(np.nan, np.nan): [1.0, 10.0], ("a", "b"): [np.nan, 20.0]}) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_sorting.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_sorting.py new file mode 100644 index 0000000000000000000000000000000000000000..b4dcef71dcf50724c90599b03d1c1c5aa99b7916 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_sorting.py @@ -0,0 +1,349 @@ +import numpy as np +import pytest + +from pandas.errors import ( + PerformanceWarning, + UnsortedIndexError, +) + +from pandas import ( + CategoricalIndex, + DataFrame, + Index, + MultiIndex, + RangeIndex, + Series, + Timestamp, +) +import pandas._testing as tm +from pandas.core.indexes.frozen import FrozenList + + +def test_sortlevel(idx): + tuples = list(idx) + np.random.default_rng(2).shuffle(tuples) + + index = MultiIndex.from_tuples(tuples) + + sorted_idx, _ = index.sortlevel(0) + expected = MultiIndex.from_tuples(sorted(tuples)) + assert sorted_idx.equals(expected) + + sorted_idx, _ = index.sortlevel(0, ascending=False) + assert sorted_idx.equals(expected[::-1]) + + sorted_idx, _ = index.sortlevel(1) + by1 = sorted(tuples, key=lambda x: (x[1], x[0])) + expected = MultiIndex.from_tuples(by1) + assert sorted_idx.equals(expected) + + sorted_idx, _ = index.sortlevel(1, ascending=False) + assert sorted_idx.equals(expected[::-1]) + + +def test_sortlevel_not_sort_remaining(): + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) + sorted_idx, _ = mi.sortlevel("A", sort_remaining=False) + assert sorted_idx.equals(mi) + + +def test_sortlevel_deterministic(): + tuples = [ + ("bar", "one"), + ("foo", "two"), + ("qux", "two"), + ("foo", "one"), + ("baz", "two"), + ("qux", "one"), + ] + + index = MultiIndex.from_tuples(tuples) + + sorted_idx, _ = index.sortlevel(0) + expected = MultiIndex.from_tuples(sorted(tuples)) + assert sorted_idx.equals(expected) + + sorted_idx, _ = index.sortlevel(0, ascending=False) + assert sorted_idx.equals(expected[::-1]) + + sorted_idx, _ = index.sortlevel(1) + by1 = sorted(tuples, key=lambda x: (x[1], x[0])) + expected = MultiIndex.from_tuples(by1) + assert sorted_idx.equals(expected) + + sorted_idx, _ = index.sortlevel(1, ascending=False) + assert sorted_idx.equals(expected[::-1]) + + +def test_sortlevel_na_position(): + # GH#51612 + midx = MultiIndex.from_tuples([(1, np.nan), (1, 1)]) + result = midx.sortlevel(level=[0, 1], na_position="last")[0] + expected = MultiIndex.from_tuples([(1, 1), (1, np.nan)]) + tm.assert_index_equal(result, expected) + + +def test_numpy_argsort(idx): + result = np.argsort(idx) + expected = idx.argsort() + tm.assert_numpy_array_equal(result, expected) + + # these are the only two types that perform + # pandas compatibility input validation - the + # rest already perform separate (or no) such + # validation via their 'values' attribute as + # defined in pandas.core.indexes/base.py - they + # cannot be changed at the moment due to + # backwards compatibility concerns + if isinstance(type(idx), (CategoricalIndex, RangeIndex)): + msg = "the 'axis' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.argsort(idx, axis=1) + + msg = "the 'kind' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.argsort(idx, kind="mergesort") + + msg = "the 'order' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.argsort(idx, order=("a", "b")) + + +def test_unsortedindex(): + # GH 11897 + mi = MultiIndex.from_tuples( + [("z", "a"), ("x", "a"), ("y", "b"), ("x", "b"), ("y", "a"), ("z", "b")], + names=["one", "two"], + ) + df = DataFrame([[i, 10 * i] for i in range(6)], index=mi, columns=["one", "two"]) + + # GH 16734: not sorted, but no real slicing + result = df.loc(axis=0)["z", "a"] + expected = df.iloc[0] + tm.assert_series_equal(result, expected) + + msg = ( + "MultiIndex slicing requires the index to be lexsorted: " + r"slicing on levels \[1\], lexsort depth 0" + ) + with pytest.raises(UnsortedIndexError, match=msg): + df.loc(axis=0)["z", slice("a")] + df.sort_index(inplace=True) + assert len(df.loc(axis=0)["z", :]) == 2 + + with pytest.raises(KeyError, match="'q'"): + df.loc(axis=0)["q", :] + + +def test_unsortedindex_doc_examples(): + # https://pandas.pydata.org/pandas-docs/stable/advanced.html#sorting-a-multiindex + dfm = DataFrame( + { + "jim": [0, 0, 1, 1], + "joe": ["x", "x", "z", "y"], + "jolie": np.random.default_rng(2).random(4), + } + ) + + dfm = dfm.set_index(["jim", "joe"]) + with tm.assert_produces_warning(PerformanceWarning): + dfm.loc[(1, "z")] + + msg = r"Key length \(2\) was greater than MultiIndex lexsort depth \(1\)" + with pytest.raises(UnsortedIndexError, match=msg): + dfm.loc[(0, "y"):(1, "z")] + + assert not dfm.index._is_lexsorted() + assert dfm.index._lexsort_depth == 1 + + # sort it + dfm = dfm.sort_index() + dfm.loc[(1, "z")] + dfm.loc[(0, "y"):(1, "z")] + + assert dfm.index._is_lexsorted() + assert dfm.index._lexsort_depth == 2 + + +def test_reconstruct_sort(): + # starts off lexsorted & monotonic + mi = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]]) + assert mi.is_monotonic_increasing + recons = mi._sort_levels_monotonic() + assert recons.is_monotonic_increasing + assert mi is recons + + assert mi.equals(recons) + assert Index(mi.values).equals(Index(recons.values)) + + # cannot convert to lexsorted + mi = MultiIndex.from_tuples( + [("z", "a"), ("x", "a"), ("y", "b"), ("x", "b"), ("y", "a"), ("z", "b")], + names=["one", "two"], + ) + assert not mi.is_monotonic_increasing + recons = mi._sort_levels_monotonic() + assert not recons.is_monotonic_increasing + assert mi.equals(recons) + assert Index(mi.values).equals(Index(recons.values)) + + # cannot convert to lexsorted + mi = MultiIndex( + levels=[["b", "d", "a"], [1, 2, 3]], + codes=[[0, 1, 0, 2], [2, 0, 0, 1]], + names=["col1", "col2"], + ) + assert not mi.is_monotonic_increasing + recons = mi._sort_levels_monotonic() + assert not recons.is_monotonic_increasing + assert mi.equals(recons) + assert Index(mi.values).equals(Index(recons.values)) + + +def test_reconstruct_remove_unused(): + # xref to GH 2770 + df = DataFrame( + [["deleteMe", 1, 9], ["keepMe", 2, 9], ["keepMeToo", 3, 9]], + columns=["first", "second", "third"], + ) + df2 = df.set_index(["first", "second"], drop=False) + df2 = df2[df2["first"] != "deleteMe"] + + # removed levels are there + expected = MultiIndex( + levels=[["deleteMe", "keepMe", "keepMeToo"], [1, 2, 3]], + codes=[[1, 2], [1, 2]], + names=["first", "second"], + ) + result = df2.index + tm.assert_index_equal(result, expected) + + expected = MultiIndex( + levels=[["keepMe", "keepMeToo"], [2, 3]], + codes=[[0, 1], [0, 1]], + names=["first", "second"], + ) + result = df2.index.remove_unused_levels() + tm.assert_index_equal(result, expected) + + # idempotent + result2 = result.remove_unused_levels() + tm.assert_index_equal(result2, expected) + assert result2.is_(result) + + +@pytest.mark.parametrize( + "first_type,second_type", [("int64", "int64"), ("datetime64[D]", "str")] +) +def test_remove_unused_levels_large(first_type, second_type): + # GH16556 + + # because tests should be deterministic (and this test in particular + # checks that levels are removed, which is not the case for every + # random input): + rng = np.random.default_rng(10) # seed is arbitrary value that works + + size = 1 << 16 + df = DataFrame( + { + "first": rng.integers(0, 1 << 13, size).astype(first_type), + "second": rng.integers(0, 1 << 10, size).astype(second_type), + "third": rng.random(size), + } + ) + df = df.groupby(["first", "second"]).sum() + df = df[df.third < 0.1] + + result = df.index.remove_unused_levels() + assert len(result.levels[0]) < len(df.index.levels[0]) + assert len(result.levels[1]) < len(df.index.levels[1]) + assert result.equals(df.index) + + expected = df.reset_index().set_index(["first", "second"]).index + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("level0", [["a", "d", "b"], ["a", "d", "b", "unused"]]) +@pytest.mark.parametrize( + "level1", [["w", "x", "y", "z"], ["w", "x", "y", "z", "unused"]] +) +def test_remove_unused_nan(level0, level1): + # GH 18417 + mi = MultiIndex(levels=[level0, level1], codes=[[0, 2, -1, 1, -1], [0, 1, 2, 3, 2]]) + + result = mi.remove_unused_levels() + tm.assert_index_equal(result, mi) + for level in 0, 1: + assert "unused" not in result.levels[level] + + +def test_argsort(idx): + result = idx.argsort() + expected = idx.values.argsort() + tm.assert_numpy_array_equal(result, expected) + + +def test_remove_unused_levels_with_nan(): + # GH 37510 + idx = Index([(1, np.nan), (3, 4)]).rename(["id1", "id2"]) + idx = idx.set_levels(["a", np.nan], level="id1") + idx = idx.remove_unused_levels() + result = idx.levels + expected = FrozenList([["a", np.nan], [4]]) + assert str(result) == str(expected) + + +def test_sort_values_nan(): + # GH48495, GH48626 + midx = MultiIndex(levels=[["A", "B", "C"], ["D"]], codes=[[1, 0, 2], [-1, -1, 0]]) + result = midx.sort_values() + expected = MultiIndex( + levels=[["A", "B", "C"], ["D"]], codes=[[0, 1, 2], [-1, -1, 0]] + ) + tm.assert_index_equal(result, expected) + + +def test_sort_values_incomparable(): + # GH48495 + mi = MultiIndex.from_arrays( + [ + [1, Timestamp("2000-01-01")], + [3, 4], + ] + ) + match = "'<' not supported between instances of 'Timestamp' and 'int'" + with pytest.raises(TypeError, match=match): + mi.sort_values() + + +@pytest.mark.parametrize("na_position", ["first", "last"]) +@pytest.mark.parametrize("dtype", ["float64", "Int64", "Float64"]) +def test_sort_values_with_na_na_position(dtype, na_position): + # 51612 + arrays = [ + Series([1, 1, 2], dtype=dtype), + Series([1, None, 3], dtype=dtype), + ] + index = MultiIndex.from_arrays(arrays) + result = index.sort_values(na_position=na_position) + if na_position == "first": + arrays = [ + Series([1, 1, 2], dtype=dtype), + Series([None, 1, 3], dtype=dtype), + ] + else: + arrays = [ + Series([1, 1, 2], dtype=dtype), + Series([1, None, 3], dtype=dtype), + ] + expected = MultiIndex.from_arrays(arrays) + tm.assert_index_equal(result, expected) + + +def test_sort_unnecessary_warning(): + # GH#55386 + midx = MultiIndex.from_tuples([(1.5, 2), (3.5, 3), (0, 1)]) + midx = midx.set_levels([2.5, np.nan, 1], level=0) + result = midx.sort_values() + expected = MultiIndex.from_tuples([(1, 3), (2.5, 1), (np.nan, 2)]) + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_take.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_take.py new file mode 100644 index 0000000000000000000000000000000000000000..543cba25c373b71b8c79c7fe0ea5ae2fb7f40b18 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/multi/test_take.py @@ -0,0 +1,78 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +def test_take(idx): + indexer = [4, 3, 0, 2] + result = idx.take(indexer) + expected = idx[indexer] + assert result.equals(expected) + + # GH 10791 + msg = "'MultiIndex' object has no attribute 'freq'" + with pytest.raises(AttributeError, match=msg): + idx.freq + + +def test_take_invalid_kwargs(idx): + indices = [1, 2] + + msg = r"take\(\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=msg): + idx.take(indices, foo=2) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + idx.take(indices, out=indices) + + msg = "the 'mode' parameter is not supported" + with pytest.raises(ValueError, match=msg): + idx.take(indices, mode="clip") + + +def test_take_fill_value(): + # GH 12631 + vals = [["A", "B"], [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]] + idx = pd.MultiIndex.from_product(vals, names=["str", "dt"]) + + result = idx.take(np.array([1, 0, -1])) + exp_vals = [ + ("A", pd.Timestamp("2011-01-02")), + ("A", pd.Timestamp("2011-01-01")), + ("B", pd.Timestamp("2011-01-02")), + ] + expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"]) + tm.assert_index_equal(result, expected) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + exp_vals = [ + ("A", pd.Timestamp("2011-01-02")), + ("A", pd.Timestamp("2011-01-01")), + (np.nan, pd.NaT), + ] + expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"]) + tm.assert_index_equal(result, expected) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + exp_vals = [ + ("A", pd.Timestamp("2011-01-02")), + ("A", pd.Timestamp("2011-01-01")), + ("B", pd.Timestamp("2011-01-02")), + ] + expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"]) + tm.assert_index_equal(result, expected) + + msg = "When allow_fill=True and fill_value is not None, all indices must be >= -1" + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + msg = "index -5 is out of bounds for( axis 0 with)? size 4" + with pytest.raises(IndexError, match=msg): + idx.take(np.array([1, -5])) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c84caaca3e1b8e0773469c7ab4bb527abb19e0d5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/__pycache__/test_astype.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/__pycache__/test_astype.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..09409ab9e4b4283cd6bfa687134cd873b2ca9b2e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/__pycache__/test_astype.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/__pycache__/test_indexing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/__pycache__/test_indexing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0325f2285b97b21018f549a1827e004a15f97c25 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/__pycache__/test_indexing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/__pycache__/test_join.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/__pycache__/test_join.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b9b7c8241416c44e9ffe9379152e3c8b71123bb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/__pycache__/test_join.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/__pycache__/test_numeric.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/__pycache__/test_numeric.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f053f045335c0f0a479891c7b62a9fe5a75ddee Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/__pycache__/test_numeric.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/__pycache__/test_setops.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/__pycache__/test_setops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8fb531822e540ab9ff0c4af018f96b10804858f9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/__pycache__/test_setops.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/test_astype.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/test_astype.py new file mode 100644 index 0000000000000000000000000000000000000000..1c2df6008de5d85789b026e947ac27a8036a9be7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/test_astype.py @@ -0,0 +1,95 @@ +import numpy as np +import pytest + +from pandas import ( + Index, + to_datetime, + to_timedelta, +) +import pandas._testing as tm + + +class TestAstype: + def test_astype_float64_to_uint64(self): + # GH#45309 used to incorrectly return Index with int64 dtype + idx = Index([0.0, 5.0, 10.0, 15.0, 20.0], dtype=np.float64) + result = idx.astype("u8") + expected = Index([0, 5, 10, 15, 20], dtype=np.uint64) + tm.assert_index_equal(result, expected, exact=True) + + idx_with_negatives = idx - 10 + with pytest.raises(ValueError, match="losslessly"): + idx_with_negatives.astype(np.uint64) + + def test_astype_float64_to_object(self): + float_index = Index([0.0, 2.5, 5.0, 7.5, 10.0], dtype=np.float64) + result = float_index.astype(object) + assert result.equals(float_index) + assert float_index.equals(result) + assert isinstance(result, Index) and result.dtype == object + + def test_astype_float64_mixed_to_object(self): + # mixed int-float + idx = Index([1.5, 2, 3, 4, 5], dtype=np.float64) + idx.name = "foo" + result = idx.astype(object) + assert result.equals(idx) + assert idx.equals(result) + assert isinstance(result, Index) and result.dtype == object + + @pytest.mark.parametrize("dtype", ["int16", "int32", "int64"]) + def test_astype_float64_to_int_dtype(self, dtype): + # GH#12881 + # a float astype int + idx = Index([0, 1, 2], dtype=np.float64) + result = idx.astype(dtype) + expected = Index([0, 1, 2], dtype=dtype) + tm.assert_index_equal(result, expected, exact=True) + + idx = Index([0, 1.1, 2], dtype=np.float64) + result = idx.astype(dtype) + expected = Index([0, 1, 2], dtype=dtype) + tm.assert_index_equal(result, expected, exact=True) + + @pytest.mark.parametrize("dtype", ["float32", "float64"]) + def test_astype_float64_to_float_dtype(self, dtype): + # GH#12881 + # a float astype int + idx = Index([0, 1, 2], dtype=np.float64) + result = idx.astype(dtype) + assert isinstance(result, Index) and result.dtype == dtype + + @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"]) + def test_astype_float_to_datetimelike(self, dtype): + # GH#49660 pre-2.0 Index.astype from floating to M8/m8/Period raised, + # inconsistent with Series.astype + idx = Index([0, 1.1, 2], dtype=np.float64) + + result = idx.astype(dtype) + if dtype[0] == "M": + expected = to_datetime(idx.values) + else: + expected = to_timedelta(idx.values) + tm.assert_index_equal(result, expected) + + # check that we match Series behavior + result = idx.to_series().set_axis(range(3)).astype(dtype) + expected = expected.to_series().set_axis(range(3)) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("dtype", [int, "int16", "int32", "int64"]) + @pytest.mark.parametrize("non_finite", [np.inf, np.nan]) + def test_cannot_cast_inf_to_int(self, non_finite, dtype): + # GH#13149 + idx = Index([1, 2, non_finite], dtype=np.float64) + + msg = r"Cannot convert non-finite values \(NA or inf\) to integer" + with pytest.raises(ValueError, match=msg): + idx.astype(dtype) + + def test_astype_from_object(self): + index = Index([1.0, np.nan, 0.2], dtype="object") + result = index.astype(float) + expected = Index([1.0, np.nan, 0.2], dtype=np.float64) + assert result.dtype == expected.dtype + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/test_indexing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/test_indexing.py new file mode 100644 index 0000000000000000000000000000000000000000..cd28d519313ed36228040361dfbb2a8dccf77be5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/test_indexing.py @@ -0,0 +1,611 @@ +import numpy as np +import pytest + +from pandas.errors import InvalidIndexError + +from pandas import ( + NA, + Index, + RangeIndex, + Series, + Timestamp, +) +import pandas._testing as tm +from pandas.core.arrays import ( + ArrowExtensionArray, + FloatingArray, +) + + +@pytest.fixture +def index_large(): + # large values used in Index[uint64] tests where no compat needed with Int64/Float64 + large = [2**63, 2**63 + 10, 2**63 + 15, 2**63 + 20, 2**63 + 25] + return Index(large, dtype=np.uint64) + + +class TestGetLoc: + def test_get_loc(self): + index = Index([0, 1, 2]) + assert index.get_loc(1) == 1 + + def test_get_loc_raises_bad_label(self): + index = Index([0, 1, 2]) + with pytest.raises(InvalidIndexError, match=r"\[1, 2\]"): + index.get_loc([1, 2]) + + def test_get_loc_float64(self): + idx = Index([0.0, 1.0, 2.0], dtype=np.float64) + + with pytest.raises(KeyError, match="^'foo'$"): + idx.get_loc("foo") + with pytest.raises(KeyError, match=r"^1\.5$"): + idx.get_loc(1.5) + with pytest.raises(KeyError, match="^True$"): + idx.get_loc(True) + with pytest.raises(KeyError, match="^False$"): + idx.get_loc(False) + + def test_get_loc_na(self): + idx = Index([np.nan, 1, 2], dtype=np.float64) + assert idx.get_loc(1) == 1 + assert idx.get_loc(np.nan) == 0 + + idx = Index([np.nan, 1, np.nan], dtype=np.float64) + assert idx.get_loc(1) == 1 + + # representable by slice [0:2:2] + msg = "'Cannot get left slice bound for non-unique label: nan'" + with pytest.raises(KeyError, match=msg): + idx.slice_locs(np.nan) + # not representable by slice + idx = Index([np.nan, 1, np.nan, np.nan], dtype=np.float64) + assert idx.get_loc(1) == 1 + msg = "'Cannot get left slice bound for non-unique label: nan" + with pytest.raises(KeyError, match=msg): + idx.slice_locs(np.nan) + + def test_get_loc_missing_nan(self): + # GH#8569 + idx = Index([1, 2], dtype=np.float64) + assert idx.get_loc(1) == 0 + with pytest.raises(KeyError, match=r"^3$"): + idx.get_loc(3) + with pytest.raises(KeyError, match="^nan$"): + idx.get_loc(np.nan) + with pytest.raises(InvalidIndexError, match=r"\[nan\]"): + # listlike/non-hashable raises TypeError + idx.get_loc([np.nan]) + + @pytest.mark.parametrize("vals", [[1], [1.0], [Timestamp("2019-12-31")], ["test"]]) + def test_get_loc_float_index_nan_with_method(self, vals): + # GH#39382 + idx = Index(vals) + with pytest.raises(KeyError, match="nan"): + idx.get_loc(np.nan) + + @pytest.mark.parametrize("dtype", ["f8", "i8", "u8"]) + def test_get_loc_numericindex_none_raises(self, dtype): + # case that goes through searchsorted and key is non-comparable to values + arr = np.arange(10**7, dtype=dtype) + idx = Index(arr) + with pytest.raises(KeyError, match="None"): + idx.get_loc(None) + + def test_get_loc_overflows(self): + # unique but non-monotonic goes through IndexEngine.mapping.get_item + idx = Index([0, 2, 1]) + + val = np.iinfo(np.int64).max + 1 + + with pytest.raises(KeyError, match=str(val)): + idx.get_loc(val) + with pytest.raises(KeyError, match=str(val)): + idx._engine.get_loc(val) + + +class TestGetIndexer: + def test_get_indexer(self): + index1 = Index([1, 2, 3, 4, 5]) + index2 = Index([2, 4, 6]) + + r1 = index1.get_indexer(index2) + e1 = np.array([1, 3, -1], dtype=np.intp) + tm.assert_almost_equal(r1, e1) + + @pytest.mark.parametrize("reverse", [True, False]) + @pytest.mark.parametrize( + "expected,method", + [ + (np.array([-1, 0, 0, 1, 1], dtype=np.intp), "pad"), + (np.array([-1, 0, 0, 1, 1], dtype=np.intp), "ffill"), + (np.array([0, 0, 1, 1, 2], dtype=np.intp), "backfill"), + (np.array([0, 0, 1, 1, 2], dtype=np.intp), "bfill"), + ], + ) + def test_get_indexer_methods(self, reverse, expected, method): + index1 = Index([1, 2, 3, 4, 5]) + index2 = Index([2, 4, 6]) + + if reverse: + index1 = index1[::-1] + expected = expected[::-1] + + result = index2.get_indexer(index1, method=method) + tm.assert_almost_equal(result, expected) + + def test_get_indexer_invalid(self): + # GH10411 + index = Index(np.arange(10)) + + with pytest.raises(ValueError, match="tolerance argument"): + index.get_indexer([1, 0], tolerance=1) + + with pytest.raises(ValueError, match="limit argument"): + index.get_indexer([1, 0], limit=1) + + @pytest.mark.parametrize( + "method, tolerance, indexer, expected", + [ + ("pad", None, [0, 5, 9], [0, 5, 9]), + ("backfill", None, [0, 5, 9], [0, 5, 9]), + ("nearest", None, [0, 5, 9], [0, 5, 9]), + ("pad", 0, [0, 5, 9], [0, 5, 9]), + ("backfill", 0, [0, 5, 9], [0, 5, 9]), + ("nearest", 0, [0, 5, 9], [0, 5, 9]), + ("pad", None, [0.2, 1.8, 8.5], [0, 1, 8]), + ("backfill", None, [0.2, 1.8, 8.5], [1, 2, 9]), + ("nearest", None, [0.2, 1.8, 8.5], [0, 2, 9]), + ("pad", 1, [0.2, 1.8, 8.5], [0, 1, 8]), + ("backfill", 1, [0.2, 1.8, 8.5], [1, 2, 9]), + ("nearest", 1, [0.2, 1.8, 8.5], [0, 2, 9]), + ("pad", 0.2, [0.2, 1.8, 8.5], [0, -1, -1]), + ("backfill", 0.2, [0.2, 1.8, 8.5], [-1, 2, -1]), + ("nearest", 0.2, [0.2, 1.8, 8.5], [0, 2, -1]), + ], + ) + def test_get_indexer_nearest(self, method, tolerance, indexer, expected): + index = Index(np.arange(10)) + + actual = index.get_indexer(indexer, method=method, tolerance=tolerance) + tm.assert_numpy_array_equal(actual, np.array(expected, dtype=np.intp)) + + @pytest.mark.parametrize("listtype", [list, tuple, Series, np.array]) + @pytest.mark.parametrize( + "tolerance, expected", + list( + zip( + [[0.3, 0.3, 0.1], [0.2, 0.1, 0.1], [0.1, 0.5, 0.5]], + [[0, 2, -1], [0, -1, -1], [-1, 2, 9]], + ) + ), + ) + def test_get_indexer_nearest_listlike_tolerance( + self, tolerance, expected, listtype + ): + index = Index(np.arange(10)) + + actual = index.get_indexer( + [0.2, 1.8, 8.5], method="nearest", tolerance=listtype(tolerance) + ) + tm.assert_numpy_array_equal(actual, np.array(expected, dtype=np.intp)) + + def test_get_indexer_nearest_error(self): + index = Index(np.arange(10)) + with pytest.raises(ValueError, match="limit argument"): + index.get_indexer([1, 0], method="nearest", limit=1) + + with pytest.raises(ValueError, match="tolerance size must match"): + index.get_indexer([1, 0], method="nearest", tolerance=[1, 2, 3]) + + @pytest.mark.parametrize( + "method,expected", + [("pad", [8, 7, 0]), ("backfill", [9, 8, 1]), ("nearest", [9, 7, 0])], + ) + def test_get_indexer_nearest_decreasing(self, method, expected): + index = Index(np.arange(10))[::-1] + + actual = index.get_indexer([0, 5, 9], method=method) + tm.assert_numpy_array_equal(actual, np.array([9, 4, 0], dtype=np.intp)) + + actual = index.get_indexer([0.2, 1.8, 8.5], method=method) + tm.assert_numpy_array_equal(actual, np.array(expected, dtype=np.intp)) + + @pytest.mark.parametrize("idx_dtype", ["int64", "float64", "uint64", "range"]) + @pytest.mark.parametrize("method", ["get_indexer", "get_indexer_non_unique"]) + def test_get_indexer_numeric_index_boolean_target(self, method, idx_dtype): + # GH 16877 + + if idx_dtype == "range": + numeric_index = RangeIndex(4) + else: + numeric_index = Index(np.arange(4, dtype=idx_dtype)) + + other = Index([True, False, True]) + + result = getattr(numeric_index, method)(other) + expected = np.array([-1, -1, -1], dtype=np.intp) + if method == "get_indexer": + tm.assert_numpy_array_equal(result, expected) + else: + missing = np.arange(3, dtype=np.intp) + tm.assert_numpy_array_equal(result[0], expected) + tm.assert_numpy_array_equal(result[1], missing) + + @pytest.mark.parametrize("method", ["pad", "backfill", "nearest"]) + def test_get_indexer_with_method_numeric_vs_bool(self, method): + left = Index([1, 2, 3]) + right = Index([True, False]) + + with pytest.raises(TypeError, match="Cannot compare"): + left.get_indexer(right, method=method) + + with pytest.raises(TypeError, match="Cannot compare"): + right.get_indexer(left, method=method) + + def test_get_indexer_numeric_vs_bool(self): + left = Index([1, 2, 3]) + right = Index([True, False]) + + res = left.get_indexer(right) + expected = -1 * np.ones(len(right), dtype=np.intp) + tm.assert_numpy_array_equal(res, expected) + + res = right.get_indexer(left) + expected = -1 * np.ones(len(left), dtype=np.intp) + tm.assert_numpy_array_equal(res, expected) + + res = left.get_indexer_non_unique(right)[0] + expected = -1 * np.ones(len(right), dtype=np.intp) + tm.assert_numpy_array_equal(res, expected) + + res = right.get_indexer_non_unique(left)[0] + expected = -1 * np.ones(len(left), dtype=np.intp) + tm.assert_numpy_array_equal(res, expected) + + def test_get_indexer_float64(self): + idx = Index([0.0, 1.0, 2.0], dtype=np.float64) + tm.assert_numpy_array_equal( + idx.get_indexer(idx), np.array([0, 1, 2], dtype=np.intp) + ) + + target = [-0.1, 0.5, 1.1] + tm.assert_numpy_array_equal( + idx.get_indexer(target, "pad"), np.array([-1, 0, 1], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "backfill"), np.array([0, 1, 2], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest"), np.array([0, 1, 1], dtype=np.intp) + ) + + def test_get_indexer_nan(self): + # GH#7820 + result = Index([1, 2, np.nan], dtype=np.float64).get_indexer([np.nan]) + expected = np.array([2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + def test_get_indexer_int64(self): + index = Index(range(0, 20, 2), dtype=np.int64) + target = Index(np.arange(10), dtype=np.int64) + indexer = index.get_indexer(target) + expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + target = Index(np.arange(10), dtype=np.int64) + indexer = index.get_indexer(target, method="pad") + expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + target = Index(np.arange(10), dtype=np.int64) + indexer = index.get_indexer(target, method="backfill") + expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + def test_get_indexer_uint64(self, index_large): + target = Index(np.arange(10).astype("uint64") * 5 + 2**63) + indexer = index_large.get_indexer(target) + expected = np.array([0, -1, 1, 2, 3, 4, -1, -1, -1, -1], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + target = Index(np.arange(10).astype("uint64") * 5 + 2**63) + indexer = index_large.get_indexer(target, method="pad") + expected = np.array([0, 0, 1, 2, 3, 4, 4, 4, 4, 4], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + target = Index(np.arange(10).astype("uint64") * 5 + 2**63) + indexer = index_large.get_indexer(target, method="backfill") + expected = np.array([0, 1, 1, 2, 3, 4, -1, -1, -1, -1], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + @pytest.mark.parametrize("val, val2", [(4, 5), (4, 4), (4, NA), (NA, NA)]) + def test_get_loc_masked(self, val, val2, any_numeric_ea_and_arrow_dtype): + # GH#39133 + idx = Index([1, 2, 3, val, val2], dtype=any_numeric_ea_and_arrow_dtype) + result = idx.get_loc(2) + assert result == 1 + + with pytest.raises(KeyError, match="9"): + idx.get_loc(9) + + def test_get_loc_masked_na(self, any_numeric_ea_and_arrow_dtype): + # GH#39133 + idx = Index([1, 2, NA], dtype=any_numeric_ea_and_arrow_dtype) + result = idx.get_loc(NA) + assert result == 2 + + idx = Index([1, 2, NA, NA], dtype=any_numeric_ea_and_arrow_dtype) + result = idx.get_loc(NA) + tm.assert_numpy_array_equal(result, np.array([False, False, True, True])) + + idx = Index([1, 2, 3], dtype=any_numeric_ea_and_arrow_dtype) + with pytest.raises(KeyError, match="NA"): + idx.get_loc(NA) + + def test_get_loc_masked_na_and_nan(self): + # GH#39133 + idx = Index( + FloatingArray( + np.array([1, 2, 1, np.nan]), mask=np.array([False, False, True, False]) + ) + ) + result = idx.get_loc(NA) + assert result == 2 + result = idx.get_loc(np.nan) + assert result == 3 + + idx = Index( + FloatingArray(np.array([1, 2, 1.0]), mask=np.array([False, False, True])) + ) + result = idx.get_loc(NA) + assert result == 2 + with pytest.raises(KeyError, match="nan"): + idx.get_loc(np.nan) + + idx = Index( + FloatingArray( + np.array([1, 2, np.nan]), mask=np.array([False, False, False]) + ) + ) + result = idx.get_loc(np.nan) + assert result == 2 + with pytest.raises(KeyError, match="NA"): + idx.get_loc(NA) + + @pytest.mark.parametrize("val", [4, 2]) + def test_get_indexer_masked_na(self, any_numeric_ea_and_arrow_dtype, val): + # GH#39133 + idx = Index([1, 2, NA, 3, val], dtype=any_numeric_ea_and_arrow_dtype) + result = idx.get_indexer_for([1, NA, 5]) + expected = np.array([0, 2, -1]) + tm.assert_numpy_array_equal(result, expected, check_dtype=False) + + @pytest.mark.parametrize("dtype", ["boolean", "bool[pyarrow]"]) + def test_get_indexer_masked_na_boolean(self, dtype): + # GH#39133 + if dtype == "bool[pyarrow]": + pytest.importorskip("pyarrow") + idx = Index([True, False, NA], dtype=dtype) + result = idx.get_loc(False) + assert result == 1 + result = idx.get_loc(NA) + assert result == 2 + + def test_get_indexer_arrow_dictionary_target(self): + pa = pytest.importorskip("pyarrow") + target = Index( + ArrowExtensionArray( + pa.array([1, 2], type=pa.dictionary(pa.int8(), pa.int8())) + ) + ) + idx = Index([1]) + + result = idx.get_indexer(target) + expected = np.array([0, -1], dtype=np.int64) + tm.assert_numpy_array_equal(result, expected) + + result_1, result_2 = idx.get_indexer_non_unique(target) + expected_1, expected_2 = np.array([0, -1], dtype=np.int64), np.array( + [1], dtype=np.int64 + ) + tm.assert_numpy_array_equal(result_1, expected_1) + tm.assert_numpy_array_equal(result_2, expected_2) + + +class TestWhere: + @pytest.mark.parametrize( + "index", + [ + Index(np.arange(5, dtype="float64")), + Index(range(0, 20, 2), dtype=np.int64), + Index(np.arange(5, dtype="uint64")), + ], + ) + def test_where(self, listlike_box, index): + cond = [True] * len(index) + expected = index + result = index.where(listlike_box(cond)) + + cond = [False] + [True] * (len(index) - 1) + expected = Index([index._na_value] + index[1:].tolist(), dtype=np.float64) + result = index.where(listlike_box(cond)) + tm.assert_index_equal(result, expected) + + def test_where_uint64(self): + idx = Index([0, 6, 2], dtype=np.uint64) + mask = np.array([False, True, False]) + other = np.array([1], dtype=np.int64) + + expected = Index([1, 6, 1], dtype=np.uint64) + + result = idx.where(mask, other) + tm.assert_index_equal(result, expected) + + result = idx.putmask(~mask, other) + tm.assert_index_equal(result, expected) + + def test_where_infers_type_instead_of_trying_to_convert_string_to_float(self): + # GH 32413 + index = Index([1, np.nan]) + cond = index.notna() + other = Index(["a", "b"], dtype="string") + + expected = Index([1.0, "b"]) + result = index.where(cond, other) + + tm.assert_index_equal(result, expected) + + +class TestTake: + @pytest.mark.parametrize("idx_dtype", [np.float64, np.int64, np.uint64]) + def test_take_preserve_name(self, idx_dtype): + index = Index([1, 2, 3, 4], dtype=idx_dtype, name="foo") + taken = index.take([3, 0, 1]) + assert index.name == taken.name + + def test_take_fill_value_float64(self): + # GH 12631 + idx = Index([1.0, 2.0, 3.0], name="xxx", dtype=np.float64) + result = idx.take(np.array([1, 0, -1])) + expected = Index([2.0, 1.0, 3.0], dtype=np.float64, name="xxx") + tm.assert_index_equal(result, expected) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + expected = Index([2.0, 1.0, np.nan], dtype=np.float64, name="xxx") + tm.assert_index_equal(result, expected) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = Index([2.0, 1.0, 3.0], dtype=np.float64, name="xxx") + tm.assert_index_equal(result, expected) + + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + msg = "index -5 is out of bounds for (axis 0 with )?size 3" + with pytest.raises(IndexError, match=msg): + idx.take(np.array([1, -5])) + + @pytest.mark.parametrize("dtype", [np.int64, np.uint64]) + def test_take_fill_value_ints(self, dtype): + # see gh-12631 + idx = Index([1, 2, 3], dtype=dtype, name="xxx") + result = idx.take(np.array([1, 0, -1])) + expected = Index([2, 1, 3], dtype=dtype, name="xxx") + tm.assert_index_equal(result, expected) + + name = type(idx).__name__ + msg = f"Unable to fill values because {name} cannot contain NA" + + # fill_value=True + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -1]), fill_value=True) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = Index([2, 1, 3], dtype=dtype, name="xxx") + tm.assert_index_equal(result, expected) + + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + msg = "index -5 is out of bounds for (axis 0 with )?size 3" + with pytest.raises(IndexError, match=msg): + idx.take(np.array([1, -5])) + + +class TestContains: + @pytest.mark.parametrize("dtype", [np.float64, np.int64, np.uint64]) + def test_contains_none(self, dtype): + # GH#35788 should return False, not raise TypeError + index = Index([0, 1, 2, 3, 4], dtype=dtype) + assert None not in index + + def test_contains_float64_nans(self): + index = Index([1.0, 2.0, np.nan], dtype=np.float64) + assert np.nan in index + + def test_contains_float64_not_nans(self): + index = Index([1.0, 2.0, np.nan], dtype=np.float64) + assert 1.0 in index + + +class TestSliceLocs: + @pytest.mark.parametrize("dtype", [int, float]) + def test_slice_locs(self, dtype): + index = Index(np.array([0, 1, 2, 5, 6, 7, 9, 10], dtype=dtype)) + n = len(index) + + assert index.slice_locs(start=2) == (2, n) + assert index.slice_locs(start=3) == (3, n) + assert index.slice_locs(3, 8) == (3, 6) + assert index.slice_locs(5, 10) == (3, n) + assert index.slice_locs(end=8) == (0, 6) + assert index.slice_locs(end=9) == (0, 7) + + # reversed + index2 = index[::-1] + assert index2.slice_locs(8, 2) == (2, 6) + assert index2.slice_locs(7, 3) == (2, 5) + + @pytest.mark.parametrize("dtype", [int, float]) + def test_slice_locs_float_locs(self, dtype): + index = Index(np.array([0, 1, 2, 5, 6, 7, 9, 10], dtype=dtype)) + n = len(index) + assert index.slice_locs(5.0, 10.0) == (3, n) + assert index.slice_locs(4.5, 10.5) == (3, 8) + + index2 = index[::-1] + assert index2.slice_locs(8.5, 1.5) == (2, 6) + assert index2.slice_locs(10.5, -1) == (0, n) + + @pytest.mark.parametrize("dtype", [int, float]) + def test_slice_locs_dup_numeric(self, dtype): + index = Index(np.array([10, 12, 12, 14], dtype=dtype)) + assert index.slice_locs(12, 12) == (1, 3) + assert index.slice_locs(11, 13) == (1, 3) + + index2 = index[::-1] + assert index2.slice_locs(12, 12) == (1, 3) + assert index2.slice_locs(13, 11) == (1, 3) + + def test_slice_locs_na(self): + index = Index([np.nan, 1, 2]) + assert index.slice_locs(1) == (1, 3) + assert index.slice_locs(np.nan) == (0, 3) + + index = Index([0, np.nan, np.nan, 1, 2]) + assert index.slice_locs(np.nan) == (1, 5) + + def test_slice_locs_na_raises(self): + index = Index([np.nan, 1, 2]) + with pytest.raises(KeyError, match=""): + index.slice_locs(start=1.5) + + with pytest.raises(KeyError, match=""): + index.slice_locs(end=1.5) + + +class TestGetSliceBounds: + @pytest.mark.parametrize("side, expected", [("left", 4), ("right", 5)]) + def test_get_slice_bounds_within(self, side, expected): + index = Index(range(6)) + result = index.get_slice_bound(4, side=side) + assert result == expected + + @pytest.mark.parametrize("side", ["left", "right"]) + @pytest.mark.parametrize("bound, expected", [(-1, 0), (10, 6)]) + def test_get_slice_bounds_outside(self, side, expected, bound): + index = Index(range(6)) + result = index.get_slice_bound(bound, side=side) + assert result == expected diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/test_join.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/test_join.py new file mode 100644 index 0000000000000000000000000000000000000000..918d5052167356b1d51018434c03e6682f828872 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/test_join.py @@ -0,0 +1,380 @@ +import numpy as np +import pytest + +import pandas._testing as tm +from pandas.core.indexes.api import Index + + +class TestJoinInt64Index: + def test_join_non_unique(self): + left = Index([4, 4, 3, 3]) + + joined, lidx, ridx = left.join(left, return_indexers=True) + + exp_joined = Index([4, 4, 4, 4, 3, 3, 3, 3]) + tm.assert_index_equal(joined, exp_joined) + + exp_lidx = np.array([0, 0, 1, 1, 2, 2, 3, 3], dtype=np.intp) + tm.assert_numpy_array_equal(lidx, exp_lidx) + + exp_ridx = np.array([0, 1, 0, 1, 2, 3, 2, 3], dtype=np.intp) + tm.assert_numpy_array_equal(ridx, exp_ridx) + + def test_join_inner(self): + index = Index(range(0, 20, 2), dtype=np.int64) + other = Index([7, 12, 25, 1, 2, 5], dtype=np.int64) + other_mono = Index([1, 2, 5, 7, 12, 25], dtype=np.int64) + + # not monotonic + res, lidx, ridx = index.join(other, how="inner", return_indexers=True) + + # no guarantee of sortedness, so sort for comparison purposes + ind = res.argsort() + res = res.take(ind) + lidx = lidx.take(ind) + ridx = ridx.take(ind) + + eres = Index([2, 12], dtype=np.int64) + elidx = np.array([1, 6], dtype=np.intp) + eridx = np.array([4, 1], dtype=np.intp) + + assert isinstance(res, Index) and res.dtype == np.int64 + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + # monotonic + res, lidx, ridx = index.join(other_mono, how="inner", return_indexers=True) + + res2 = index.intersection(other_mono) + tm.assert_index_equal(res, res2) + + elidx = np.array([1, 6], dtype=np.intp) + eridx = np.array([1, 4], dtype=np.intp) + assert isinstance(res, Index) and res.dtype == np.int64 + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + def test_join_left(self): + index = Index(range(0, 20, 2), dtype=np.int64) + other = Index([7, 12, 25, 1, 2, 5], dtype=np.int64) + other_mono = Index([1, 2, 5, 7, 12, 25], dtype=np.int64) + + # not monotonic + res, lidx, ridx = index.join(other, how="left", return_indexers=True) + eres = index + eridx = np.array([-1, 4, -1, -1, -1, -1, 1, -1, -1, -1], dtype=np.intp) + + assert isinstance(res, Index) and res.dtype == np.int64 + tm.assert_index_equal(res, eres) + assert lidx is None + tm.assert_numpy_array_equal(ridx, eridx) + + # monotonic + res, lidx, ridx = index.join(other_mono, how="left", return_indexers=True) + eridx = np.array([-1, 1, -1, -1, -1, -1, 4, -1, -1, -1], dtype=np.intp) + assert isinstance(res, Index) and res.dtype == np.int64 + tm.assert_index_equal(res, eres) + assert lidx is None + tm.assert_numpy_array_equal(ridx, eridx) + + # non-unique + idx = Index([1, 1, 2, 5]) + idx2 = Index([1, 2, 5, 7, 9]) + res, lidx, ridx = idx2.join(idx, how="left", return_indexers=True) + eres = Index([1, 1, 2, 5, 7, 9]) # 1 is in idx2, so it should be x2 + eridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) + elidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + def test_join_right(self): + index = Index(range(0, 20, 2), dtype=np.int64) + other = Index([7, 12, 25, 1, 2, 5], dtype=np.int64) + other_mono = Index([1, 2, 5, 7, 12, 25], dtype=np.int64) + + # not monotonic + res, lidx, ridx = index.join(other, how="right", return_indexers=True) + eres = other + elidx = np.array([-1, 6, -1, -1, 1, -1], dtype=np.intp) + + assert isinstance(other, Index) and other.dtype == np.int64 + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + assert ridx is None + + # monotonic + res, lidx, ridx = index.join(other_mono, how="right", return_indexers=True) + eres = other_mono + elidx = np.array([-1, 1, -1, -1, 6, -1], dtype=np.intp) + assert isinstance(other, Index) and other.dtype == np.int64 + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + assert ridx is None + + # non-unique + idx = Index([1, 1, 2, 5]) + idx2 = Index([1, 2, 5, 7, 9]) + res, lidx, ridx = idx.join(idx2, how="right", return_indexers=True) + eres = Index([1, 1, 2, 5, 7, 9]) # 1 is in idx2, so it should be x2 + elidx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) + eridx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + def test_join_non_int_index(self): + index = Index(range(0, 20, 2), dtype=np.int64) + other = Index([3, 6, 7, 8, 10], dtype=object) + + outer = index.join(other, how="outer") + outer2 = other.join(index, how="outer") + expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14, 16, 18]) + tm.assert_index_equal(outer, outer2) + tm.assert_index_equal(outer, expected) + + inner = index.join(other, how="inner") + inner2 = other.join(index, how="inner") + expected = Index([6, 8, 10]) + tm.assert_index_equal(inner, inner2) + tm.assert_index_equal(inner, expected) + + left = index.join(other, how="left") + tm.assert_index_equal(left, index.astype(object)) + + left2 = other.join(index, how="left") + tm.assert_index_equal(left2, other) + + right = index.join(other, how="right") + tm.assert_index_equal(right, other) + + right2 = other.join(index, how="right") + tm.assert_index_equal(right2, index.astype(object)) + + def test_join_outer(self): + index = Index(range(0, 20, 2), dtype=np.int64) + other = Index([7, 12, 25, 1, 2, 5], dtype=np.int64) + other_mono = Index([1, 2, 5, 7, 12, 25], dtype=np.int64) + + # not monotonic + # guarantee of sortedness + res, lidx, ridx = index.join(other, how="outer", return_indexers=True) + noidx_res = index.join(other, how="outer") + tm.assert_index_equal(res, noidx_res) + + eres = Index([0, 1, 2, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 25], dtype=np.int64) + elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], dtype=np.intp) + eridx = np.array( + [-1, 3, 4, -1, 5, -1, 0, -1, -1, 1, -1, -1, -1, 2], dtype=np.intp + ) + + assert isinstance(res, Index) and res.dtype == np.int64 + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + # monotonic + res, lidx, ridx = index.join(other_mono, how="outer", return_indexers=True) + noidx_res = index.join(other_mono, how="outer") + tm.assert_index_equal(res, noidx_res) + + elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], dtype=np.intp) + eridx = np.array( + [-1, 0, 1, -1, 2, -1, 3, -1, -1, 4, -1, -1, -1, 5], dtype=np.intp + ) + assert isinstance(res, Index) and res.dtype == np.int64 + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + +class TestJoinUInt64Index: + @pytest.fixture + def index_large(self): + # large values used in TestUInt64Index where no compat needed with int64/float64 + large = [2**63, 2**63 + 10, 2**63 + 15, 2**63 + 20, 2**63 + 25] + return Index(large, dtype=np.uint64) + + def test_join_inner(self, index_large): + other = Index(2**63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64")) + other_mono = Index(2**63 + np.array([1, 2, 7, 10, 12, 25], dtype="uint64")) + + # not monotonic + res, lidx, ridx = index_large.join(other, how="inner", return_indexers=True) + + # no guarantee of sortedness, so sort for comparison purposes + ind = res.argsort() + res = res.take(ind) + lidx = lidx.take(ind) + ridx = ridx.take(ind) + + eres = Index(2**63 + np.array([10, 25], dtype="uint64")) + elidx = np.array([1, 4], dtype=np.intp) + eridx = np.array([5, 2], dtype=np.intp) + + assert isinstance(res, Index) and res.dtype == np.uint64 + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + # monotonic + res, lidx, ridx = index_large.join( + other_mono, how="inner", return_indexers=True + ) + + res2 = index_large.intersection(other_mono) + tm.assert_index_equal(res, res2) + + elidx = np.array([1, 4], dtype=np.intp) + eridx = np.array([3, 5], dtype=np.intp) + + assert isinstance(res, Index) and res.dtype == np.uint64 + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + def test_join_left(self, index_large): + other = Index(2**63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64")) + other_mono = Index(2**63 + np.array([1, 2, 7, 10, 12, 25], dtype="uint64")) + + # not monotonic + res, lidx, ridx = index_large.join(other, how="left", return_indexers=True) + eres = index_large + eridx = np.array([-1, 5, -1, -1, 2], dtype=np.intp) + + assert isinstance(res, Index) and res.dtype == np.uint64 + tm.assert_index_equal(res, eres) + assert lidx is None + tm.assert_numpy_array_equal(ridx, eridx) + + # monotonic + res, lidx, ridx = index_large.join(other_mono, how="left", return_indexers=True) + eridx = np.array([-1, 3, -1, -1, 5], dtype=np.intp) + + assert isinstance(res, Index) and res.dtype == np.uint64 + tm.assert_index_equal(res, eres) + assert lidx is None + tm.assert_numpy_array_equal(ridx, eridx) + + # non-unique + idx = Index(2**63 + np.array([1, 1, 2, 5], dtype="uint64")) + idx2 = Index(2**63 + np.array([1, 2, 5, 7, 9], dtype="uint64")) + res, lidx, ridx = idx2.join(idx, how="left", return_indexers=True) + + # 1 is in idx2, so it should be x2 + eres = Index(2**63 + np.array([1, 1, 2, 5, 7, 9], dtype="uint64")) + eridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) + elidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) + + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + def test_join_right(self, index_large): + other = Index(2**63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64")) + other_mono = Index(2**63 + np.array([1, 2, 7, 10, 12, 25], dtype="uint64")) + + # not monotonic + res, lidx, ridx = index_large.join(other, how="right", return_indexers=True) + eres = other + elidx = np.array([-1, -1, 4, -1, -1, 1], dtype=np.intp) + + tm.assert_numpy_array_equal(lidx, elidx) + assert isinstance(other, Index) and other.dtype == np.uint64 + tm.assert_index_equal(res, eres) + assert ridx is None + + # monotonic + res, lidx, ridx = index_large.join( + other_mono, how="right", return_indexers=True + ) + eres = other_mono + elidx = np.array([-1, -1, -1, 1, -1, 4], dtype=np.intp) + + assert isinstance(other, Index) and other.dtype == np.uint64 + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_index_equal(res, eres) + assert ridx is None + + # non-unique + idx = Index(2**63 + np.array([1, 1, 2, 5], dtype="uint64")) + idx2 = Index(2**63 + np.array([1, 2, 5, 7, 9], dtype="uint64")) + res, lidx, ridx = idx.join(idx2, how="right", return_indexers=True) + + # 1 is in idx2, so it should be x2 + eres = Index(2**63 + np.array([1, 1, 2, 5, 7, 9], dtype="uint64")) + elidx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) + eridx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) + + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + def test_join_non_int_index(self, index_large): + other = Index( + 2**63 + np.array([1, 5, 7, 10, 20], dtype="uint64"), dtype=object + ) + + outer = index_large.join(other, how="outer") + outer2 = other.join(index_large, how="outer") + expected = Index( + 2**63 + np.array([0, 1, 5, 7, 10, 15, 20, 25], dtype="uint64") + ) + tm.assert_index_equal(outer, outer2) + tm.assert_index_equal(outer, expected) + + inner = index_large.join(other, how="inner") + inner2 = other.join(index_large, how="inner") + expected = Index(2**63 + np.array([10, 20], dtype="uint64")) + tm.assert_index_equal(inner, inner2) + tm.assert_index_equal(inner, expected) + + left = index_large.join(other, how="left") + tm.assert_index_equal(left, index_large.astype(object)) + + left2 = other.join(index_large, how="left") + tm.assert_index_equal(left2, other) + + right = index_large.join(other, how="right") + tm.assert_index_equal(right, other) + + right2 = other.join(index_large, how="right") + tm.assert_index_equal(right2, index_large.astype(object)) + + def test_join_outer(self, index_large): + other = Index(2**63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64")) + other_mono = Index(2**63 + np.array([1, 2, 7, 10, 12, 25], dtype="uint64")) + + # not monotonic + # guarantee of sortedness + res, lidx, ridx = index_large.join(other, how="outer", return_indexers=True) + noidx_res = index_large.join(other, how="outer") + tm.assert_index_equal(res, noidx_res) + + eres = Index( + 2**63 + np.array([0, 1, 2, 7, 10, 12, 15, 20, 25], dtype="uint64") + ) + elidx = np.array([0, -1, -1, -1, 1, -1, 2, 3, 4], dtype=np.intp) + eridx = np.array([-1, 3, 4, 0, 5, 1, -1, -1, 2], dtype=np.intp) + + assert isinstance(res, Index) and res.dtype == np.uint64 + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + # monotonic + res, lidx, ridx = index_large.join( + other_mono, how="outer", return_indexers=True + ) + noidx_res = index_large.join(other_mono, how="outer") + tm.assert_index_equal(res, noidx_res) + + elidx = np.array([0, -1, -1, -1, 1, -1, 2, 3, 4], dtype=np.intp) + eridx = np.array([-1, 0, 1, 2, 3, 4, -1, -1, 5], dtype=np.intp) + + assert isinstance(res, Index) and res.dtype == np.uint64 + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/test_numeric.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/test_numeric.py new file mode 100644 index 0000000000000000000000000000000000000000..4fd807e1827ddc4faf900f15dcefa18c08d4cd0b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/test_numeric.py @@ -0,0 +1,553 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Index, + Series, +) +import pandas._testing as tm + + +class TestFloatNumericIndex: + @pytest.fixture(params=[np.float64, np.float32]) + def dtype(self, request): + return request.param + + @pytest.fixture + def simple_index(self, dtype): + values = np.arange(5, dtype=dtype) + return Index(values) + + @pytest.fixture( + params=[ + [1.5, 2, 3, 4, 5], + [0.0, 2.5, 5.0, 7.5, 10.0], + [5, 4, 3, 2, 1.5], + [10.0, 7.5, 5.0, 2.5, 0.0], + ], + ids=["mixed", "float", "mixed_dec", "float_dec"], + ) + def index(self, request, dtype): + return Index(request.param, dtype=dtype) + + @pytest.fixture + def mixed_index(self, dtype): + return Index([1.5, 2, 3, 4, 5], dtype=dtype) + + @pytest.fixture + def float_index(self, dtype): + return Index([0.0, 2.5, 5.0, 7.5, 10.0], dtype=dtype) + + def test_repr_roundtrip(self, index): + tm.assert_index_equal(eval(repr(index)), index, exact=True) + + def check_coerce(self, a, b, is_float_index=True): + assert a.equals(b) + tm.assert_index_equal(a, b, exact=False) + if is_float_index: + assert isinstance(b, Index) + else: + assert type(b) is Index + + def test_constructor_from_list_no_dtype(self): + index = Index([1.5, 2.5, 3.5]) + assert index.dtype == np.float64 + + def test_constructor(self, dtype): + index_cls = Index + + # explicit construction + index = index_cls([1, 2, 3, 4, 5], dtype=dtype) + + assert isinstance(index, index_cls) + assert index.dtype == dtype + + expected = np.array([1, 2, 3, 4, 5], dtype=dtype) + tm.assert_numpy_array_equal(index.values, expected) + + index = index_cls(np.array([1, 2, 3, 4, 5]), dtype=dtype) + assert isinstance(index, index_cls) + assert index.dtype == dtype + + index = index_cls([1.0, 2, 3, 4, 5], dtype=dtype) + assert isinstance(index, index_cls) + assert index.dtype == dtype + + index = index_cls(np.array([1.0, 2, 3, 4, 5]), dtype=dtype) + assert isinstance(index, index_cls) + assert index.dtype == dtype + + index = index_cls([1.0, 2, 3, 4, 5], dtype=dtype) + assert isinstance(index, index_cls) + assert index.dtype == dtype + + index = index_cls(np.array([1.0, 2, 3, 4, 5]), dtype=dtype) + assert isinstance(index, index_cls) + assert index.dtype == dtype + + # nan handling + result = index_cls([np.nan, np.nan], dtype=dtype) + assert pd.isna(result.values).all() + + result = index_cls(np.array([np.nan]), dtype=dtype) + assert pd.isna(result.values).all() + + def test_constructor_invalid(self): + index_cls = Index + cls_name = index_cls.__name__ + # invalid + msg = ( + rf"{cls_name}\(\.\.\.\) must be called with a collection of " + r"some kind, 0\.0 was passed" + ) + with pytest.raises(TypeError, match=msg): + index_cls(0.0) + + def test_constructor_coerce(self, mixed_index, float_index): + self.check_coerce(mixed_index, Index([1.5, 2, 3, 4, 5])) + self.check_coerce(float_index, Index(np.arange(5) * 2.5)) + + result = Index(np.array(np.arange(5) * 2.5, dtype=object)) + assert result.dtype == object # as of 2.0 to match Series + self.check_coerce(float_index, result.astype("float64")) + + def test_constructor_explicit(self, mixed_index, float_index): + # these don't auto convert + self.check_coerce( + float_index, Index((np.arange(5) * 2.5), dtype=object), is_float_index=False + ) + self.check_coerce( + mixed_index, Index([1.5, 2, 3, 4, 5], dtype=object), is_float_index=False + ) + + def test_type_coercion_fail(self, any_int_numpy_dtype): + # see gh-15832 + msg = "Trying to coerce float values to integers" + with pytest.raises(ValueError, match=msg): + Index([1, 2, 3.5], dtype=any_int_numpy_dtype) + + def test_equals_numeric(self): + index_cls = Index + + idx = index_cls([1.0, 2.0]) + assert idx.equals(idx) + assert idx.identical(idx) + + idx2 = index_cls([1.0, 2.0]) + assert idx.equals(idx2) + + idx = index_cls([1.0, np.nan]) + assert idx.equals(idx) + assert idx.identical(idx) + + idx2 = index_cls([1.0, np.nan]) + assert idx.equals(idx2) + + @pytest.mark.parametrize( + "other", + ( + Index([1, 2], dtype=np.int64), + Index([1.0, 2.0], dtype=object), + Index([1, 2], dtype=object), + ), + ) + def test_equals_numeric_other_index_type(self, other): + idx = Index([1.0, 2.0]) + assert idx.equals(other) + assert other.equals(idx) + + @pytest.mark.parametrize( + "vals", + [ + pd.date_range("2016-01-01", periods=3), + pd.timedelta_range("1 Day", periods=3), + ], + ) + def test_lookups_datetimelike_values(self, vals, dtype): + # If we have datetime64 or timedelta64 values, make sure they are + # wrapped correctly GH#31163 + ser = Series(vals, index=range(3, 6)) + ser.index = ser.index.astype(dtype) + + expected = vals[1] + + result = ser[4.0] + assert isinstance(result, type(expected)) and result == expected + result = ser[4] + assert isinstance(result, type(expected)) and result == expected + + result = ser.loc[4.0] + assert isinstance(result, type(expected)) and result == expected + result = ser.loc[4] + assert isinstance(result, type(expected)) and result == expected + + result = ser.at[4.0] + assert isinstance(result, type(expected)) and result == expected + # GH#31329 .at[4] should cast to 4.0, matching .loc behavior + result = ser.at[4] + assert isinstance(result, type(expected)) and result == expected + + result = ser.iloc[1] + assert isinstance(result, type(expected)) and result == expected + + result = ser.iat[1] + assert isinstance(result, type(expected)) and result == expected + + def test_doesnt_contain_all_the_things(self): + idx = Index([np.nan]) + assert not idx.isin([0]).item() + assert not idx.isin([1]).item() + assert idx.isin([np.nan]).item() + + def test_nan_multiple_containment(self): + index_cls = Index + + idx = index_cls([1.0, np.nan]) + tm.assert_numpy_array_equal(idx.isin([1.0]), np.array([True, False])) + tm.assert_numpy_array_equal(idx.isin([2.0, np.pi]), np.array([False, False])) + tm.assert_numpy_array_equal(idx.isin([np.nan]), np.array([False, True])) + tm.assert_numpy_array_equal(idx.isin([1.0, np.nan]), np.array([True, True])) + idx = index_cls([1.0, 2.0]) + tm.assert_numpy_array_equal(idx.isin([np.nan]), np.array([False, False])) + + def test_fillna_float64(self): + index_cls = Index + # GH 11343 + idx = Index([1.0, np.nan, 3.0], dtype=float, name="x") + # can't downcast + exp = Index([1.0, 0.1, 3.0], name="x") + tm.assert_index_equal(idx.fillna(0.1), exp, exact=True) + + # downcast + exp = index_cls([1.0, 2.0, 3.0], name="x") + tm.assert_index_equal(idx.fillna(2), exp) + + # object + exp = Index([1.0, "obj", 3.0], name="x") + tm.assert_index_equal(idx.fillna("obj"), exp, exact=True) + + def test_logical_compat(self, simple_index): + idx = simple_index + assert idx.all() == idx.values.all() + assert idx.any() == idx.values.any() + + assert idx.all() == idx.to_series().all() + assert idx.any() == idx.to_series().any() + + +class TestNumericInt: + @pytest.fixture(params=[np.int64, np.int32, np.int16, np.int8, np.uint64]) + def dtype(self, request): + return request.param + + @pytest.fixture + def simple_index(self, dtype): + return Index(range(0, 20, 2), dtype=dtype) + + def test_is_monotonic(self): + index_cls = Index + + index = index_cls([1, 2, 3, 4]) + assert index.is_monotonic_increasing is True + assert index.is_monotonic_increasing is True + assert index._is_strictly_monotonic_increasing is True + assert index.is_monotonic_decreasing is False + assert index._is_strictly_monotonic_decreasing is False + + index = index_cls([4, 3, 2, 1]) + assert index.is_monotonic_increasing is False + assert index._is_strictly_monotonic_increasing is False + assert index._is_strictly_monotonic_decreasing is True + + index = index_cls([1]) + assert index.is_monotonic_increasing is True + assert index.is_monotonic_increasing is True + assert index.is_monotonic_decreasing is True + assert index._is_strictly_monotonic_increasing is True + assert index._is_strictly_monotonic_decreasing is True + + def test_is_strictly_monotonic(self): + index_cls = Index + + index = index_cls([1, 1, 2, 3]) + assert index.is_monotonic_increasing is True + assert index._is_strictly_monotonic_increasing is False + + index = index_cls([3, 2, 1, 1]) + assert index.is_monotonic_decreasing is True + assert index._is_strictly_monotonic_decreasing is False + + index = index_cls([1, 1]) + assert index.is_monotonic_increasing + assert index.is_monotonic_decreasing + assert not index._is_strictly_monotonic_increasing + assert not index._is_strictly_monotonic_decreasing + + def test_logical_compat(self, simple_index): + idx = simple_index + assert idx.all() == idx.values.all() + assert idx.any() == idx.values.any() + + def test_identical(self, simple_index, dtype): + index = simple_index + + idx = Index(index.copy()) + assert idx.identical(index) + + same_values_different_type = Index(idx, dtype=object) + assert not idx.identical(same_values_different_type) + + idx = index.astype(dtype=object) + idx = idx.rename("foo") + same_values = Index(idx, dtype=object) + assert same_values.identical(idx) + + assert not idx.identical(index) + assert Index(same_values, name="foo", dtype=object).identical(idx) + + assert not index.astype(dtype=object).identical(index.astype(dtype=dtype)) + + def test_cant_or_shouldnt_cast(self, dtype): + msg = r"invalid literal for int\(\) with base 10: 'foo'" + + # can't + data = ["foo", "bar", "baz"] + with pytest.raises(ValueError, match=msg): + Index(data, dtype=dtype) + + def test_view_index(self, simple_index): + index = simple_index + msg = "Passing a type in .*Index.view is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + index.view(Index) + + def test_prevent_casting(self, simple_index): + index = simple_index + result = index.astype("O") + assert result.dtype == np.object_ + + +class TestIntNumericIndex: + @pytest.fixture(params=[np.int64, np.int32, np.int16, np.int8]) + def dtype(self, request): + return request.param + + def test_constructor_from_list_no_dtype(self): + index = Index([1, 2, 3]) + assert index.dtype == np.int64 + + def test_constructor(self, dtype): + index_cls = Index + + # scalar raise Exception + msg = ( + rf"{index_cls.__name__}\(\.\.\.\) must be called with a collection of some " + "kind, 5 was passed" + ) + with pytest.raises(TypeError, match=msg): + index_cls(5) + + # copy + # pass list, coerce fine + index = index_cls([-5, 0, 1, 2], dtype=dtype) + arr = index.values.copy() + new_index = index_cls(arr, copy=True) + tm.assert_index_equal(new_index, index, exact=True) + val = int(arr[0]) + 3000 + + # this should not change index + if dtype != np.int8: + # NEP 50 won't allow assignment that would overflow + arr[0] = val + assert new_index[0] != val + + if dtype == np.int64: + # pass list, coerce fine + index = index_cls([-5, 0, 1, 2], dtype=dtype) + expected = Index([-5, 0, 1, 2], dtype=dtype) + tm.assert_index_equal(index, expected) + + # from iterable + index = index_cls(iter([-5, 0, 1, 2]), dtype=dtype) + expected = index_cls([-5, 0, 1, 2], dtype=dtype) + tm.assert_index_equal(index, expected, exact=True) + + # interpret list-like + expected = index_cls([5, 0], dtype=dtype) + for cls in [Index, index_cls]: + for idx in [ + cls([5, 0], dtype=dtype), + cls(np.array([5, 0]), dtype=dtype), + cls(Series([5, 0]), dtype=dtype), + ]: + tm.assert_index_equal(idx, expected) + + def test_constructor_corner(self, dtype): + index_cls = Index + + arr = np.array([1, 2, 3, 4], dtype=object) + + index = index_cls(arr, dtype=dtype) + assert index.values.dtype == index.dtype + if dtype == np.int64: + without_dtype = Index(arr) + # as of 2.0 we do not infer a dtype when we get an object-dtype + # ndarray of numbers, matching Series behavior + assert without_dtype.dtype == object + + tm.assert_index_equal(index, without_dtype.astype(np.int64)) + + # preventing casting + arr = np.array([1, "2", 3, "4"], dtype=object) + msg = "Trying to coerce float values to integers" + with pytest.raises(ValueError, match=msg): + index_cls(arr, dtype=dtype) + + def test_constructor_coercion_signed_to_unsigned( + self, + any_unsigned_int_numpy_dtype, + ): + # see gh-15832 + msg = "|".join( + [ + "Trying to coerce negative values to unsigned integers", + "The elements provided in the data cannot all be casted", + ] + ) + with pytest.raises(OverflowError, match=msg): + Index([-1], dtype=any_unsigned_int_numpy_dtype) + + def test_constructor_np_signed(self, any_signed_int_numpy_dtype): + # GH#47475 + scalar = np.dtype(any_signed_int_numpy_dtype).type(1) + result = Index([scalar]) + expected = Index([1], dtype=any_signed_int_numpy_dtype) + tm.assert_index_equal(result, expected, exact=True) + + def test_constructor_np_unsigned(self, any_unsigned_int_numpy_dtype): + # GH#47475 + scalar = np.dtype(any_unsigned_int_numpy_dtype).type(1) + result = Index([scalar]) + expected = Index([1], dtype=any_unsigned_int_numpy_dtype) + tm.assert_index_equal(result, expected, exact=True) + + def test_coerce_list(self): + # coerce things + arr = Index([1, 2, 3, 4]) + assert isinstance(arr, Index) + + # but not if explicit dtype passed + arr = Index([1, 2, 3, 4], dtype=object) + assert type(arr) is Index + + +class TestFloat16Index: + # float 16 indexes not supported + # GH 49535 + def test_constructor(self): + index_cls = Index + dtype = np.float16 + + msg = "float16 indexes are not supported" + + # explicit construction + with pytest.raises(NotImplementedError, match=msg): + index_cls([1, 2, 3, 4, 5], dtype=dtype) + + with pytest.raises(NotImplementedError, match=msg): + index_cls(np.array([1, 2, 3, 4, 5]), dtype=dtype) + + with pytest.raises(NotImplementedError, match=msg): + index_cls([1.0, 2, 3, 4, 5], dtype=dtype) + + with pytest.raises(NotImplementedError, match=msg): + index_cls(np.array([1.0, 2, 3, 4, 5]), dtype=dtype) + + with pytest.raises(NotImplementedError, match=msg): + index_cls([1.0, 2, 3, 4, 5], dtype=dtype) + + with pytest.raises(NotImplementedError, match=msg): + index_cls(np.array([1.0, 2, 3, 4, 5]), dtype=dtype) + + # nan handling + with pytest.raises(NotImplementedError, match=msg): + index_cls([np.nan, np.nan], dtype=dtype) + + with pytest.raises(NotImplementedError, match=msg): + index_cls(np.array([np.nan]), dtype=dtype) + + +@pytest.mark.parametrize( + "box", + [list, lambda x: np.array(x, dtype=object), lambda x: Index(x, dtype=object)], +) +def test_uint_index_does_not_convert_to_float64(box): + # https://github.com/pandas-dev/pandas/issues/28279 + # https://github.com/pandas-dev/pandas/issues/28023 + series = Series( + [0, 1, 2, 3, 4, 5], + index=[ + 7606741985629028552, + 17876870360202815256, + 17876870360202815256, + 13106359306506049338, + 8991270399732411471, + 8991270399732411472, + ], + ) + + result = series.loc[box([7606741985629028552, 17876870360202815256])] + + expected = Index( + [7606741985629028552, 17876870360202815256, 17876870360202815256], + dtype="uint64", + ) + tm.assert_index_equal(result.index, expected) + + tm.assert_equal(result, series.iloc[:3]) + + +def test_float64_index_equals(): + # https://github.com/pandas-dev/pandas/issues/35217 + float_index = Index([1.0, 2, 3]) + string_index = Index(["1", "2", "3"]) + + result = float_index.equals(string_index) + assert result is False + + result = string_index.equals(float_index) + assert result is False + + +def test_map_dtype_inference_unsigned_to_signed(): + # GH#44609 cases where we don't retain dtype + idx = Index([1, 2, 3], dtype=np.uint64) + result = idx.map(lambda x: -x) + expected = Index([-1, -2, -3], dtype=np.int64) + tm.assert_index_equal(result, expected) + + +def test_map_dtype_inference_overflows(): + # GH#44609 case where we have to upcast + idx = Index(np.array([1, 2, 3], dtype=np.int8)) + result = idx.map(lambda x: x * 1000) + # TODO: we could plausibly try to infer down to int16 here + expected = Index([1000, 2000, 3000], dtype=np.int64) + tm.assert_index_equal(result, expected) + + +def test_view_to_datetimelike(): + # GH#55710 + idx = Index([1, 2, 3]) + res = idx.view("m8[s]") + expected = pd.TimedeltaIndex(idx.values.view("m8[s]")) + tm.assert_index_equal(res, expected) + + res2 = idx.view("m8[D]") + expected2 = idx.values.view("m8[D]") + tm.assert_numpy_array_equal(res2, expected2) + + res3 = idx.view("M8[h]") + expected3 = idx.values.view("M8[h]") + tm.assert_numpy_array_equal(res3, expected3) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/test_setops.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/test_setops.py new file mode 100644 index 0000000000000000000000000000000000000000..376b51dd98bb1b1c7c6c8a67914bc72f6c6c588d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/numeric/test_setops.py @@ -0,0 +1,168 @@ +from datetime import ( + datetime, + timedelta, +) + +import numpy as np +import pytest + +import pandas._testing as tm +from pandas.core.indexes.api import ( + Index, + RangeIndex, +) + + +@pytest.fixture +def index_large(): + # large values used in TestUInt64Index where no compat needed with int64/float64 + large = [2**63, 2**63 + 10, 2**63 + 15, 2**63 + 20, 2**63 + 25] + return Index(large, dtype=np.uint64) + + +class TestSetOps: + @pytest.mark.parametrize("dtype", ["f8", "u8", "i8"]) + def test_union_non_numeric(self, dtype): + # corner case, non-numeric + index = Index(np.arange(5, dtype=dtype), dtype=dtype) + assert index.dtype == dtype + + other = Index([datetime.now() + timedelta(i) for i in range(4)], dtype=object) + result = index.union(other) + expected = Index(np.concatenate((index, other))) + tm.assert_index_equal(result, expected) + + result = other.union(index) + expected = Index(np.concatenate((other, index))) + tm.assert_index_equal(result, expected) + + def test_intersection(self): + index = Index(range(5), dtype=np.int64) + + other = Index([1, 2, 3, 4, 5]) + result = index.intersection(other) + expected = Index(np.sort(np.intersect1d(index.values, other.values))) + tm.assert_index_equal(result, expected) + + result = other.intersection(index) + expected = Index( + np.sort(np.asarray(np.intersect1d(index.values, other.values))) + ) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("dtype", ["int64", "uint64"]) + def test_int_float_union_dtype(self, dtype): + # https://github.com/pandas-dev/pandas/issues/26778 + # [u]int | float -> float + index = Index([0, 2, 3], dtype=dtype) + other = Index([0.5, 1.5], dtype=np.float64) + expected = Index([0.0, 0.5, 1.5, 2.0, 3.0], dtype=np.float64) + result = index.union(other) + tm.assert_index_equal(result, expected) + + result = other.union(index) + tm.assert_index_equal(result, expected) + + def test_range_float_union_dtype(self): + # https://github.com/pandas-dev/pandas/issues/26778 + index = RangeIndex(start=0, stop=3) + other = Index([0.5, 1.5], dtype=np.float64) + result = index.union(other) + expected = Index([0.0, 0.5, 1, 1.5, 2.0], dtype=np.float64) + tm.assert_index_equal(result, expected) + + result = other.union(index) + tm.assert_index_equal(result, expected) + + def test_range_uint64_union_dtype(self): + # https://github.com/pandas-dev/pandas/issues/26778 + index = RangeIndex(start=0, stop=3) + other = Index([0, 10], dtype=np.uint64) + result = index.union(other) + expected = Index([0, 1, 2, 10], dtype=object) + tm.assert_index_equal(result, expected) + + result = other.union(index) + tm.assert_index_equal(result, expected) + + def test_float64_index_difference(self): + # https://github.com/pandas-dev/pandas/issues/35217 + float_index = Index([1.0, 2, 3]) + string_index = Index(["1", "2", "3"]) + + result = float_index.difference(string_index) + tm.assert_index_equal(result, float_index) + + result = string_index.difference(float_index) + tm.assert_index_equal(result, string_index) + + def test_intersection_uint64_outside_int64_range(self, index_large): + other = Index([2**63, 2**63 + 5, 2**63 + 10, 2**63 + 15, 2**63 + 20]) + result = index_large.intersection(other) + expected = Index(np.sort(np.intersect1d(index_large.values, other.values))) + tm.assert_index_equal(result, expected) + + result = other.intersection(index_large) + expected = Index( + np.sort(np.asarray(np.intersect1d(index_large.values, other.values))) + ) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "index2,keeps_name", + [ + (Index([4, 7, 6, 5, 3], name="index"), True), + (Index([4, 7, 6, 5, 3], name="other"), False), + ], + ) + def test_intersection_monotonic(self, index2, keeps_name, sort): + index1 = Index([5, 3, 2, 4, 1], name="index") + expected = Index([5, 3, 4]) + + if keeps_name: + expected.name = "index" + + result = index1.intersection(index2, sort=sort) + if sort is None: + expected = expected.sort_values() + tm.assert_index_equal(result, expected) + + def test_symmetric_difference(self, sort): + # smoke + index1 = Index([5, 2, 3, 4], name="index1") + index2 = Index([2, 3, 4, 1]) + result = index1.symmetric_difference(index2, sort=sort) + expected = Index([5, 1]) + if sort is not None: + tm.assert_index_equal(result, expected) + else: + tm.assert_index_equal(result, expected.sort_values()) + assert result.name is None + if sort is None: + expected = expected.sort_values() + tm.assert_index_equal(result, expected) + + +class TestSetOpsSort: + @pytest.mark.parametrize("slice_", [slice(None), slice(0)]) + def test_union_sort_other_special(self, slice_): + # https://github.com/pandas-dev/pandas/issues/24959 + + idx = Index([1, 0, 2]) + # default, sort=None + other = idx[slice_] + tm.assert_index_equal(idx.union(other), idx) + tm.assert_index_equal(other.union(idx), idx) + + # sort=False + tm.assert_index_equal(idx.union(other, sort=False), idx) + + @pytest.mark.parametrize("slice_", [slice(None), slice(0)]) + def test_union_sort_special_true(self, slice_): + idx = Index([1, 0, 2]) + # default, sort=None + other = idx[slice_] + + result = idx.union(other, sort=True) + expected = Index([0, 1, 2]) + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/object/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/object/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/object/test_astype.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/object/test_astype.py new file mode 100644 index 0000000000000000000000000000000000000000..9c1ef302c5b51cad92e2c0b1996d4d58d90e5ec6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/object/test_astype.py @@ -0,0 +1,33 @@ +import pytest + +from pandas import ( + Index, + NaT, + Series, +) +import pandas._testing as tm + + +def test_astype_str_from_bytes(): + # https://github.com/pandas-dev/pandas/issues/38607 + # GH#49658 pre-2.0 Index called .values.astype(str) here, which effectively + # did a .decode() on the bytes object. In 2.0 we go through + # ensure_string_array which does f"{val}" + idx = Index(["あ", b"a"], dtype="object") + result = idx.astype(str) + expected = Index(["あ", "a"], dtype="object") + tm.assert_index_equal(result, expected) + + # while we're here, check that Series.astype behaves the same + result = Series(idx).astype(str) + expected = Series(expected, dtype=object) + tm.assert_series_equal(result, expected) + + +def test_astype_invalid_nas_to_tdt64_raises(): + # GH#45722 don't cast np.datetime64 NaTs to timedelta64 NaT + idx = Index([NaT.asm8] * 2, dtype=object) + + msg = r"Invalid type for timedelta scalar: " + with pytest.raises(TypeError, match=msg): + idx.astype("m8[ns]") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/object/test_indexing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/object/test_indexing.py new file mode 100644 index 0000000000000000000000000000000000000000..ebf9dac715f8d19677b7a60eefb9f65345884782 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/object/test_indexing.py @@ -0,0 +1,233 @@ +from decimal import Decimal + +import numpy as np +import pytest + +from pandas._libs.missing import ( + NA, + is_matching_na, +) +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import Index +import pandas._testing as tm + + +class TestGetIndexer: + @pytest.mark.parametrize( + "method,expected", + [ + ("pad", np.array([-1, 0, 1, 1], dtype=np.intp)), + ("backfill", np.array([0, 0, 1, -1], dtype=np.intp)), + ], + ) + def test_get_indexer_strings(self, method, expected): + index = Index(["b", "c"]) + actual = index.get_indexer(["a", "b", "c", "d"], method=method) + + tm.assert_numpy_array_equal(actual, expected) + + def test_get_indexer_strings_raises(self, using_infer_string): + index = Index(["b", "c"]) + + if using_infer_string: + import pyarrow as pa + + msg = "has no kernel" + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="nearest") + + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) + + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + index.get_indexer( + ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] + ) + + else: + msg = r"unsupported operand type\(s\) for -: 'str' and 'str'" + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="nearest") + + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) + + with pytest.raises(TypeError, match=msg): + index.get_indexer( + ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] + ) + + def test_get_indexer_with_NA_values( + self, unique_nulls_fixture, unique_nulls_fixture2 + ): + # GH#22332 + # check pairwise, that no pair of na values + # is mangled + if unique_nulls_fixture is unique_nulls_fixture2: + return # skip it, values are not unique + arr = np.array([unique_nulls_fixture, unique_nulls_fixture2], dtype=object) + index = Index(arr, dtype=object) + result = index.get_indexer( + Index( + [unique_nulls_fixture, unique_nulls_fixture2, "Unknown"], dtype=object + ) + ) + expected = np.array([0, 1, -1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + +class TestGetIndexerNonUnique: + def test_get_indexer_non_unique_nas( + self, nulls_fixture, request, using_infer_string + ): + # even though this isn't non-unique, this should still work + if using_infer_string and (nulls_fixture is None or nulls_fixture is NA): + request.applymarker(pytest.mark.xfail(reason="NAs are cast to NaN")) + index = Index(["a", "b", nulls_fixture]) + indexer, missing = index.get_indexer_non_unique([nulls_fixture]) + + expected_indexer = np.array([2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected_indexer) + tm.assert_numpy_array_equal(missing, expected_missing) + + # actually non-unique + index = Index(["a", nulls_fixture, "b", nulls_fixture]) + indexer, missing = index.get_indexer_non_unique([nulls_fixture]) + + expected_indexer = np.array([1, 3], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected_indexer) + tm.assert_numpy_array_equal(missing, expected_missing) + + # matching-but-not-identical nans + if is_matching_na(nulls_fixture, float("NaN")): + index = Index(["a", float("NaN"), "b", float("NaN")]) + match_but_not_identical = True + elif is_matching_na(nulls_fixture, Decimal("NaN")): + index = Index(["a", Decimal("NaN"), "b", Decimal("NaN")]) + match_but_not_identical = True + else: + match_but_not_identical = False + + if match_but_not_identical: + indexer, missing = index.get_indexer_non_unique([nulls_fixture]) + + expected_indexer = np.array([1, 3], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected_indexer) + tm.assert_numpy_array_equal(missing, expected_missing) + + @pytest.mark.filterwarnings("ignore:elementwise comp:DeprecationWarning") + def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): + expected_missing = np.array([], dtype=np.intp) + # matching-but-not-identical nats + if is_matching_na(np_nat_fixture, np_nat_fixture2): + # ensure nats are different objects + index = Index( + np.array( + ["2021-10-02", np_nat_fixture.copy(), np_nat_fixture2.copy()], + dtype=object, + ), + dtype=object, + ) + # pass as index to prevent target from being casted to DatetimeIndex + indexer, missing = index.get_indexer_non_unique( + Index([np_nat_fixture], dtype=object) + ) + expected_indexer = np.array([1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected_indexer) + tm.assert_numpy_array_equal(missing, expected_missing) + # dt64nat vs td64nat + else: + try: + np_nat_fixture == np_nat_fixture2 + except (TypeError, OverflowError): + # Numpy will raise on uncomparable types, like + # np.datetime64('NaT', 'Y') and np.datetime64('NaT', 'ps') + # https://github.com/numpy/numpy/issues/22762 + return + index = Index( + np.array( + [ + "2021-10-02", + np_nat_fixture, + np_nat_fixture2, + np_nat_fixture, + np_nat_fixture2, + ], + dtype=object, + ), + dtype=object, + ) + # pass as index to prevent target from being casted to DatetimeIndex + indexer, missing = index.get_indexer_non_unique( + Index([np_nat_fixture], dtype=object) + ) + expected_indexer = np.array([1, 3], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected_indexer) + tm.assert_numpy_array_equal(missing, expected_missing) + + +class TestSliceLocs: + @pytest.mark.parametrize( + "dtype", + [ + "object", + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), + ], + ) + @pytest.mark.parametrize( + "in_slice,expected", + [ + # error: Slice index must be an integer or None + (pd.IndexSlice[::-1], "yxdcb"), + (pd.IndexSlice["b":"y":-1], ""), # type: ignore[misc] + (pd.IndexSlice["b"::-1], "b"), # type: ignore[misc] + (pd.IndexSlice[:"b":-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice[:"y":-1], "y"), # type: ignore[misc] + (pd.IndexSlice["y"::-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice["y"::-4], "yb"), # type: ignore[misc] + # absent labels + (pd.IndexSlice[:"a":-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice[:"a":-2], "ydb"), # type: ignore[misc] + (pd.IndexSlice["z"::-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice["z"::-3], "yc"), # type: ignore[misc] + (pd.IndexSlice["m"::-1], "dcb"), # type: ignore[misc] + (pd.IndexSlice[:"m":-1], "yx"), # type: ignore[misc] + (pd.IndexSlice["a":"a":-1], ""), # type: ignore[misc] + (pd.IndexSlice["z":"z":-1], ""), # type: ignore[misc] + (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] + ], + ) + def test_slice_locs_negative_step(self, in_slice, expected, dtype): + index = Index(list("bcdxy"), dtype=dtype) + + s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) + result = index[s_start : s_stop : in_slice.step] + expected = Index(list(expected), dtype=dtype) + tm.assert_index_equal(result, expected) + + @td.skip_if_no("pyarrow") + def test_slice_locs_negative_step_oob(self): + index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]") + + result = index[-10:5:1] + tm.assert_index_equal(result, index) + + result = index[4:-10:-1] + expected = Index(list("yxdcb"), dtype="string[pyarrow_numpy]") + tm.assert_index_equal(result, expected) + + def test_slice_locs_dup(self): + index = Index(["a", "a", "b", "c", "d", "d"]) + assert index.slice_locs("a", "d") == (0, 6) + assert index.slice_locs(end="d") == (0, 6) + assert index.slice_locs("a", "c") == (0, 4) + assert index.slice_locs("b", "d") == (2, 6) + + index2 = index[::-1] + assert index2.slice_locs("d", "a") == (0, 6) + assert index2.slice_locs(end="a") == (0, 6) + assert index2.slice_locs("d", "b") == (0, 4) + assert index2.slice_locs("c", "a") == (2, 6) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_astype.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_astype.py new file mode 100644 index 0000000000000000000000000000000000000000..d545bfd2fae0f7034c8cb62a8955c8b3c0ba0d6d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_astype.py @@ -0,0 +1,151 @@ +import numpy as np +import pytest + +from pandas import ( + CategoricalIndex, + DatetimeIndex, + Index, + NaT, + Period, + PeriodIndex, + period_range, +) +import pandas._testing as tm + + +class TestPeriodIndexAsType: + @pytest.mark.parametrize("dtype", [float, "timedelta64", "timedelta64[ns]"]) + def test_astype_raises(self, dtype): + # GH#13149, GH#13209 + idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq="D") + msg = "Cannot cast PeriodIndex to dtype" + with pytest.raises(TypeError, match=msg): + idx.astype(dtype) + + def test_astype_conversion(self): + # GH#13149, GH#13209 + idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq="D", name="idx") + + result = idx.astype(object) + expected = Index( + [Period("2016-05-16", freq="D")] + [Period(NaT, freq="D")] * 3, + dtype="object", + name="idx", + ) + tm.assert_index_equal(result, expected) + + result = idx.astype(np.int64) + expected = Index( + [16937] + [-9223372036854775808] * 3, dtype=np.int64, name="idx" + ) + tm.assert_index_equal(result, expected) + + result = idx.astype(str) + expected = Index([str(x) for x in idx], name="idx", dtype=object) + tm.assert_index_equal(result, expected) + + idx = period_range("1990", "2009", freq="Y", name="idx") + result = idx.astype("i8") + tm.assert_index_equal(result, Index(idx.asi8, name="idx")) + tm.assert_numpy_array_equal(result.values, idx.asi8) + + def test_astype_uint(self): + arr = period_range("2000", periods=2, name="idx") + + with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"): + arr.astype("uint64") + with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"): + arr.astype("uint32") + + def test_astype_object(self): + idx = PeriodIndex([], freq="M") + + exp = np.array([], dtype=object) + tm.assert_numpy_array_equal(idx.astype(object).values, exp) + tm.assert_numpy_array_equal(idx._mpl_repr(), exp) + + idx = PeriodIndex(["2011-01", NaT], freq="M") + + exp = np.array([Period("2011-01", freq="M"), NaT], dtype=object) + tm.assert_numpy_array_equal(idx.astype(object).values, exp) + tm.assert_numpy_array_equal(idx._mpl_repr(), exp) + + exp = np.array([Period("2011-01-01", freq="D"), NaT], dtype=object) + idx = PeriodIndex(["2011-01-01", NaT], freq="D") + tm.assert_numpy_array_equal(idx.astype(object).values, exp) + tm.assert_numpy_array_equal(idx._mpl_repr(), exp) + + # TODO: de-duplicate this version (from test_ops) with the one above + # (from test_period) + def test_astype_object2(self): + idx = period_range(start="2013-01-01", periods=4, freq="M", name="idx") + expected_list = [ + Period("2013-01-31", freq="M"), + Period("2013-02-28", freq="M"), + Period("2013-03-31", freq="M"), + Period("2013-04-30", freq="M"), + ] + expected = Index(expected_list, dtype=object, name="idx") + result = idx.astype(object) + assert isinstance(result, Index) + assert result.dtype == object + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert idx.tolist() == expected_list + + idx = PeriodIndex( + ["2013-01-01", "2013-01-02", "NaT", "2013-01-04"], freq="D", name="idx" + ) + expected_list = [ + Period("2013-01-01", freq="D"), + Period("2013-01-02", freq="D"), + Period("NaT", freq="D"), + Period("2013-01-04", freq="D"), + ] + expected = Index(expected_list, dtype=object, name="idx") + result = idx.astype(object) + assert isinstance(result, Index) + assert result.dtype == object + tm.assert_index_equal(result, expected) + for i in [0, 1, 3]: + assert result[i] == expected[i] + assert result[2] is NaT + assert result.name == expected.name + + result_list = idx.tolist() + for i in [0, 1, 3]: + assert result_list[i] == expected_list[i] + assert result_list[2] is NaT + + def test_astype_category(self): + obj = period_range("2000", periods=2, name="idx") + result = obj.astype("category") + expected = CategoricalIndex( + [Period("2000-01-01", freq="D"), Period("2000-01-02", freq="D")], name="idx" + ) + tm.assert_index_equal(result, expected) + + result = obj._data.astype("category") + expected = expected.values + tm.assert_categorical_equal(result, expected) + + def test_astype_array_fallback(self): + obj = period_range("2000", periods=2, name="idx") + result = obj.astype(bool) + expected = Index(np.array([True, True]), name="idx") + tm.assert_index_equal(result, expected) + + result = obj._data.astype(bool) + expected = np.array([True, True]) + tm.assert_numpy_array_equal(result, expected) + + def test_period_astype_to_timestamp(self, unit): + # GH#55958 + pi = PeriodIndex(["2011-01", "2011-02", "2011-03"], freq="M") + + exp = DatetimeIndex( + ["2011-01-01", "2011-02-01", "2011-03-01"], tz="US/Eastern" + ).as_unit(unit) + res = pi.astype(f"datetime64[{unit}, US/Eastern]") + tm.assert_index_equal(res, exp) + assert res.freq == exp.freq diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_insert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_insert.py new file mode 100644 index 0000000000000000000000000000000000000000..32bbe09d925679579c1ec015b435870d1282e6b3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_insert.py @@ -0,0 +1,18 @@ +import numpy as np +import pytest + +from pandas import ( + NaT, + PeriodIndex, + period_range, +) +import pandas._testing as tm + + +class TestInsert: + @pytest.mark.parametrize("na", [np.nan, NaT, None]) + def test_insert(self, na): + # GH#18295 (test missing) + expected = PeriodIndex(["2017Q1", NaT, "2017Q2", "2017Q3", "2017Q4"], freq="Q") + result = period_range("2017Q1", periods=4, freq="Q").insert(1, na) + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_repeat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_repeat.py new file mode 100644 index 0000000000000000000000000000000000000000..fc344b06420d16a436c84a70f45a292cf6045856 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_repeat.py @@ -0,0 +1,26 @@ +import numpy as np +import pytest + +from pandas import ( + PeriodIndex, + period_range, +) +import pandas._testing as tm + + +class TestRepeat: + @pytest.mark.parametrize("use_numpy", [True, False]) + @pytest.mark.parametrize( + "index", + [ + period_range("2000-01-01", periods=3, freq="D"), + period_range("2001-01-01", periods=3, freq="2D"), + PeriodIndex(["2001-01", "NaT", "2003-01"], freq="M"), + ], + ) + def test_repeat_freqstr(self, index, use_numpy): + # GH#10183 + expected = PeriodIndex([per for per in index for _ in range(3)]) + result = np.repeat(index, 3) if use_numpy else index.repeat(3) + tm.assert_index_equal(result, expected) + assert result.freqstr == index.freqstr diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_shift.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_shift.py new file mode 100644 index 0000000000000000000000000000000000000000..fca3e3a559e1fe2e53571f5af919e9a0c49c4e68 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_shift.py @@ -0,0 +1,122 @@ +import numpy as np +import pytest + +from pandas import ( + PeriodIndex, + period_range, +) +import pandas._testing as tm + + +class TestPeriodIndexShift: + # --------------------------------------------------------------- + # PeriodIndex.shift is used by __add__ and __sub__ + + def test_pi_shift_ndarray(self): + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx" + ) + result = idx.shift(np.array([1, 2, 3, 4])) + expected = PeriodIndex( + ["2011-02", "2011-04", "NaT", "2011-08"], freq="M", name="idx" + ) + tm.assert_index_equal(result, expected) + + result = idx.shift(np.array([1, -2, 3, -4])) + expected = PeriodIndex( + ["2011-02", "2010-12", "NaT", "2010-12"], freq="M", name="idx" + ) + tm.assert_index_equal(result, expected) + + def test_shift(self): + pi1 = period_range(freq="Y", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="Y", start="1/1/2002", end="12/1/2010") + + tm.assert_index_equal(pi1.shift(0), pi1) + + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(1), pi2) + + pi1 = period_range(freq="Y", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="Y", start="1/1/2000", end="12/1/2008") + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(-1), pi2) + + pi1 = period_range(freq="M", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="M", start="2/1/2001", end="1/1/2010") + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(1), pi2) + + pi1 = period_range(freq="M", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="M", start="12/1/2000", end="11/1/2009") + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(-1), pi2) + + pi1 = period_range(freq="D", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="D", start="1/2/2001", end="12/2/2009") + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(1), pi2) + + pi1 = period_range(freq="D", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="D", start="12/31/2000", end="11/30/2009") + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(-1), pi2) + + def test_shift_corner_cases(self): + # GH#9903 + idx = PeriodIndex([], name="xxx", freq="h") + + msg = "`freq` argument is not supported for PeriodIndex.shift" + with pytest.raises(TypeError, match=msg): + # period shift doesn't accept freq + idx.shift(1, freq="h") + + tm.assert_index_equal(idx.shift(0), idx) + tm.assert_index_equal(idx.shift(3), idx) + + idx = PeriodIndex( + ["2011-01-01 10:00", "2011-01-01 11:00", "2011-01-01 12:00"], + name="xxx", + freq="h", + ) + tm.assert_index_equal(idx.shift(0), idx) + exp = PeriodIndex( + ["2011-01-01 13:00", "2011-01-01 14:00", "2011-01-01 15:00"], + name="xxx", + freq="h", + ) + tm.assert_index_equal(idx.shift(3), exp) + exp = PeriodIndex( + ["2011-01-01 07:00", "2011-01-01 08:00", "2011-01-01 09:00"], + name="xxx", + freq="h", + ) + tm.assert_index_equal(idx.shift(-3), exp) + + def test_shift_nat(self): + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx" + ) + result = idx.shift(1) + expected = PeriodIndex( + ["2011-02", "2011-03", "NaT", "2011-05"], freq="M", name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + + def test_shift_gh8083(self): + # test shift for PeriodIndex + # GH#8083 + drange = period_range("20130101", periods=5, freq="D") + result = drange.shift(1) + expected = PeriodIndex( + ["2013-01-02", "2013-01-03", "2013-01-04", "2013-01-05", "2013-01-06"], + freq="D", + ) + tm.assert_index_equal(result, expected) + + def test_shift_periods(self): + # GH #22458 : argument 'n' was deprecated in favor of 'periods' + idx = period_range(freq="Y", start="1/1/2001", end="12/1/2009") + tm.assert_index_equal(idx.shift(periods=0), idx) + tm.assert_index_equal(idx.shift(0), idx) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/ranges/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/ranges/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5eabef3ee3b3884253b26cb841aefa33f509b807 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/ranges/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/ranges/__pycache__/test_constructors.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/ranges/__pycache__/test_constructors.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..940e13500b346f41a50169d646615931f675fdf4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/ranges/__pycache__/test_constructors.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/ranges/__pycache__/test_indexing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/ranges/__pycache__/test_indexing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0f6a055d7fbac02f0e448827990af370aa4fcd2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/ranges/__pycache__/test_indexing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/ranges/__pycache__/test_join.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/ranges/__pycache__/test_join.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7a88a5763d0cdc01a2da941373fe32bf7b71eaf9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/ranges/__pycache__/test_join.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/ranges/__pycache__/test_range.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/ranges/__pycache__/test_range.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..52912dc65bff256b8f43889942b3611ace6b64ce Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/ranges/__pycache__/test_range.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/ranges/__pycache__/test_setops.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/ranges/__pycache__/test_setops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..924cc1e7eac3c9f92c3cf8cb1e81db04ee4da639 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/ranges/__pycache__/test_setops.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c53acd092484ef7d88d84836934809dd3cfba44 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_arithmetic.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_arithmetic.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91ace442c5edcb41cf1d7ab1a3bca43d02368573 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_arithmetic.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_constructors.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_constructors.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8dfe6fe44d2a2b1ba92f500fb8fec75c60044cbd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_constructors.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_formats.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_formats.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c0be934eea04a31605f98f1895c689c47c0942d8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_formats.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_indexing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_indexing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2175e3a5dae4fd2dded7f1049e2bc730c3aafdae Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_indexing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_pickle.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_pickle.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b49ddc39e1872c70dcc17bb1f294c6ed25fc653 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_pickle.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_scalar_compat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_scalar_compat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..986301c18ddae5f42be7732ab57543f6d6f6251f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_scalar_compat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_searchsorted.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_searchsorted.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..406be155d1c42b9820710e042a13978faf6d47ab Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_searchsorted.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_setops.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_setops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8bb08af3a626c152bd7106a2de29ac20d1c55b7e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_setops.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_timedelta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_timedelta.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c64633be2808867c2bb5c1cb8d83791b277a0975 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_timedelta.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_timedelta_range.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_timedelta_range.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d56478a79c4a895a67cecf717567e46b23467861 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_timedelta_range.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..30021169e71c9a7b3ffd389db5209b1327870416 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/__pycache__/test_astype.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/__pycache__/test_astype.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ffefd98a60b5d76c4a114731c9396c59ca9cd350 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/__pycache__/test_astype.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/__pycache__/test_factorize.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/__pycache__/test_factorize.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..12b2d8cda61dba2275b5db0595a4c5ab001d49af Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/__pycache__/test_factorize.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/__pycache__/test_fillna.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/__pycache__/test_fillna.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc0fdf8f9072b9565e11076339059ab19d9fc94c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/__pycache__/test_fillna.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/__pycache__/test_insert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/__pycache__/test_insert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ed20a527a0cd48ff4aa16ed1a971276127ffd32 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/__pycache__/test_insert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/__pycache__/test_repeat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/__pycache__/test_repeat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..319402bff1c5c57db0b44ca0110483d471cbb994 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/__pycache__/test_repeat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/__pycache__/test_shift.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/__pycache__/test_shift.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..67338fa907ef56ed2a5072b95d887eae2ae69391 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/__pycache__/test_shift.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/test_astype.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/test_astype.py new file mode 100644 index 0000000000000000000000000000000000000000..311f2b5c9aa598de68543825d1bde9dc814b469e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/test_astype.py @@ -0,0 +1,176 @@ +from datetime import timedelta + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Index, + NaT, + Timedelta, + TimedeltaIndex, + timedelta_range, +) +import pandas._testing as tm +from pandas.core.arrays import TimedeltaArray + + +class TestTimedeltaIndex: + def test_astype_object(self): + idx = timedelta_range(start="1 days", periods=4, freq="D", name="idx") + expected_list = [ + Timedelta("1 days"), + Timedelta("2 days"), + Timedelta("3 days"), + Timedelta("4 days"), + ] + result = idx.astype(object) + expected = Index(expected_list, dtype=object, name="idx") + tm.assert_index_equal(result, expected) + assert idx.tolist() == expected_list + + def test_astype_object_with_nat(self): + idx = TimedeltaIndex( + [timedelta(days=1), timedelta(days=2), NaT, timedelta(days=4)], name="idx" + ) + expected_list = [ + Timedelta("1 days"), + Timedelta("2 days"), + NaT, + Timedelta("4 days"), + ] + result = idx.astype(object) + expected = Index(expected_list, dtype=object, name="idx") + tm.assert_index_equal(result, expected) + assert idx.tolist() == expected_list + + def test_astype(self): + # GH 13149, GH 13209 + idx = TimedeltaIndex([1e14, "NaT", NaT, np.nan], name="idx") + + result = idx.astype(object) + expected = Index( + [Timedelta("1 days 03:46:40")] + [NaT] * 3, dtype=object, name="idx" + ) + tm.assert_index_equal(result, expected) + + result = idx.astype(np.int64) + expected = Index( + [100000000000000] + [-9223372036854775808] * 3, dtype=np.int64, name="idx" + ) + tm.assert_index_equal(result, expected) + + result = idx.astype(str) + expected = Index([str(x) for x in idx], name="idx", dtype=object) + tm.assert_index_equal(result, expected) + + rng = timedelta_range("1 days", periods=10) + result = rng.astype("i8") + tm.assert_index_equal(result, Index(rng.asi8)) + tm.assert_numpy_array_equal(rng.asi8, result.values) + + def test_astype_uint(self): + arr = timedelta_range("1h", periods=2) + + with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"): + arr.astype("uint64") + with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"): + arr.astype("uint32") + + def test_astype_timedelta64(self): + # GH 13149, GH 13209 + idx = TimedeltaIndex([1e14, "NaT", NaT, np.nan]) + + msg = ( + r"Cannot convert from timedelta64\[ns\] to timedelta64. " + "Supported resolutions are 's', 'ms', 'us', 'ns'" + ) + with pytest.raises(ValueError, match=msg): + idx.astype("timedelta64") + + result = idx.astype("timedelta64[ns]") + tm.assert_index_equal(result, idx) + assert result is not idx + + result = idx.astype("timedelta64[ns]", copy=False) + tm.assert_index_equal(result, idx) + assert result is idx + + def test_astype_to_td64d_raises(self, index_or_series): + # We don't support "D" reso + scalar = Timedelta(days=31) + td = index_or_series( + [scalar, scalar, scalar + timedelta(minutes=5, seconds=3), NaT], + dtype="m8[ns]", + ) + msg = ( + r"Cannot convert from timedelta64\[ns\] to timedelta64\[D\]. " + "Supported resolutions are 's', 'ms', 'us', 'ns'" + ) + with pytest.raises(ValueError, match=msg): + td.astype("timedelta64[D]") + + def test_astype_ms_to_s(self, index_or_series): + scalar = Timedelta(days=31) + td = index_or_series( + [scalar, scalar, scalar + timedelta(minutes=5, seconds=3), NaT], + dtype="m8[ns]", + ) + + exp_values = np.asarray(td).astype("m8[s]") + exp_tda = TimedeltaArray._simple_new(exp_values, dtype=exp_values.dtype) + expected = index_or_series(exp_tda) + assert expected.dtype == "m8[s]" + result = td.astype("timedelta64[s]") + tm.assert_equal(result, expected) + + def test_astype_freq_conversion(self): + # pre-2.0 td64 astype converted to float64. now for supported units + # (s, ms, us, ns) this converts to the requested dtype. + # This matches TDA and Series + tdi = timedelta_range("1 Day", periods=30) + + res = tdi.astype("m8[s]") + exp_values = np.asarray(tdi).astype("m8[s]") + exp_tda = TimedeltaArray._simple_new( + exp_values, dtype=exp_values.dtype, freq=tdi.freq + ) + expected = Index(exp_tda) + assert expected.dtype == "m8[s]" + tm.assert_index_equal(res, expected) + + # check this matches Series and TimedeltaArray + res = tdi._data.astype("m8[s]") + tm.assert_equal(res, expected._values) + + res = tdi.to_series().astype("m8[s]") + tm.assert_equal(res._values, expected._values._with_freq(None)) + + @pytest.mark.parametrize("dtype", [float, "datetime64", "datetime64[ns]"]) + def test_astype_raises(self, dtype): + # GH 13149, GH 13209 + idx = TimedeltaIndex([1e14, "NaT", NaT, np.nan]) + msg = "Cannot cast TimedeltaIndex to dtype" + with pytest.raises(TypeError, match=msg): + idx.astype(dtype) + + def test_astype_category(self): + obj = timedelta_range("1h", periods=2, freq="h") + + result = obj.astype("category") + expected = pd.CategoricalIndex([Timedelta("1h"), Timedelta("2h")]) + tm.assert_index_equal(result, expected) + + result = obj._data.astype("category") + expected = expected.values + tm.assert_categorical_equal(result, expected) + + def test_astype_array_fallback(self): + obj = timedelta_range("1h", periods=2) + result = obj.astype(bool) + expected = Index(np.array([True, True])) + tm.assert_index_equal(result, expected) + + result = obj._data.astype(bool) + expected = np.array([True, True]) + tm.assert_numpy_array_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/test_factorize.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/test_factorize.py new file mode 100644 index 0000000000000000000000000000000000000000..24ab3888412d08b54543ed22910c67ce9bdf328f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/test_factorize.py @@ -0,0 +1,40 @@ +import numpy as np + +from pandas import ( + TimedeltaIndex, + factorize, + timedelta_range, +) +import pandas._testing as tm + + +class TestTimedeltaIndexFactorize: + def test_factorize(self): + idx1 = TimedeltaIndex(["1 day", "1 day", "2 day", "2 day", "3 day", "3 day"]) + + exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) + exp_idx = TimedeltaIndex(["1 day", "2 day", "3 day"]) + + arr, idx = idx1.factorize() + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq + + arr, idx = idx1.factorize(sort=True) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq + + def test_factorize_preserves_freq(self): + # GH#38120 freq should be preserved + idx3 = timedelta_range("1 day", periods=4, freq="s") + exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) + arr, idx = idx3.factorize() + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq + + arr, idx = factorize(idx3) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/test_fillna.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/test_fillna.py new file mode 100644 index 0000000000000000000000000000000000000000..40aa95d0a46058d2dc3fc5208ca39328d96b23fb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/test_fillna.py @@ -0,0 +1,22 @@ +from pandas import ( + Index, + NaT, + Timedelta, + TimedeltaIndex, +) +import pandas._testing as tm + + +class TestFillNA: + def test_fillna_timedelta(self): + # GH#11343 + idx = TimedeltaIndex(["1 day", NaT, "3 day"]) + + exp = TimedeltaIndex(["1 day", "2 day", "3 day"]) + tm.assert_index_equal(idx.fillna(Timedelta("2 day")), exp) + + exp = TimedeltaIndex(["1 day", "3 hour", "3 day"]) + idx.fillna(Timedelta("3 hour")) + + exp = Index([Timedelta("1 day"), "x", Timedelta("3 day")], dtype=object) + tm.assert_index_equal(idx.fillna("x"), exp) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/test_insert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/test_insert.py new file mode 100644 index 0000000000000000000000000000000000000000..f8164102815f61ec61962524db2a2b3dd0ff6d55 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/test_insert.py @@ -0,0 +1,145 @@ +from datetime import timedelta + +import numpy as np +import pytest + +from pandas._libs import lib + +import pandas as pd +from pandas import ( + Index, + Timedelta, + TimedeltaIndex, + timedelta_range, +) +import pandas._testing as tm + + +class TestTimedeltaIndexInsert: + def test_insert(self): + idx = TimedeltaIndex(["4day", "1day", "2day"], name="idx") + + result = idx.insert(2, timedelta(days=5)) + exp = TimedeltaIndex(["4day", "1day", "5day", "2day"], name="idx") + tm.assert_index_equal(result, exp) + + # insertion of non-datetime should coerce to object index + result = idx.insert(1, "inserted") + expected = Index( + [Timedelta("4day"), "inserted", Timedelta("1day"), Timedelta("2day")], + name="idx", + ) + assert not isinstance(result, TimedeltaIndex) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + + idx = timedelta_range("1day 00:00:01", periods=3, freq="s", name="idx") + + # preserve freq + expected_0 = TimedeltaIndex( + ["1day", "1day 00:00:01", "1day 00:00:02", "1day 00:00:03"], + name="idx", + freq="s", + ) + expected_3 = TimedeltaIndex( + ["1day 00:00:01", "1day 00:00:02", "1day 00:00:03", "1day 00:00:04"], + name="idx", + freq="s", + ) + + # reset freq to None + expected_1_nofreq = TimedeltaIndex( + ["1day 00:00:01", "1day 00:00:01", "1day 00:00:02", "1day 00:00:03"], + name="idx", + freq=None, + ) + expected_3_nofreq = TimedeltaIndex( + ["1day 00:00:01", "1day 00:00:02", "1day 00:00:03", "1day 00:00:05"], + name="idx", + freq=None, + ) + + cases = [ + (0, Timedelta("1day"), expected_0), + (-3, Timedelta("1day"), expected_0), + (3, Timedelta("1day 00:00:04"), expected_3), + (1, Timedelta("1day 00:00:01"), expected_1_nofreq), + (3, Timedelta("1day 00:00:05"), expected_3_nofreq), + ] + + for n, d, expected in cases: + result = idx.insert(n, d) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + @pytest.mark.parametrize( + "null", [None, np.nan, np.timedelta64("NaT"), pd.NaT, pd.NA] + ) + def test_insert_nat(self, null): + # GH 18295 (test missing) + idx = timedelta_range("1day", "3day") + result = idx.insert(1, null) + expected = TimedeltaIndex(["1day", pd.NaT, "2day", "3day"]) + tm.assert_index_equal(result, expected) + + def test_insert_invalid_na(self): + idx = TimedeltaIndex(["4day", "1day", "2day"], name="idx") + + item = np.datetime64("NaT") + result = idx.insert(0, item) + + expected = Index([item] + list(idx), dtype=object, name="idx") + tm.assert_index_equal(result, expected) + + # Also works if we pass a different dt64nat object + item2 = np.datetime64("NaT") + result = idx.insert(0, item2) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "item", [0, np.int64(0), np.float64(0), np.array(0), np.datetime64(456, "us")] + ) + def test_insert_mismatched_types_raises(self, item): + # GH#33703 dont cast these to td64 + tdi = TimedeltaIndex(["4day", "1day", "2day"], name="idx") + + result = tdi.insert(1, item) + + expected = Index( + [tdi[0], lib.item_from_zerodim(item)] + list(tdi[1:]), + dtype=object, + name="idx", + ) + tm.assert_index_equal(result, expected) + + def test_insert_castable_str(self): + idx = timedelta_range("1day", "3day") + + result = idx.insert(0, "1 Day") + + expected = TimedeltaIndex([idx[0]] + list(idx)) + tm.assert_index_equal(result, expected) + + def test_insert_non_castable_str(self): + idx = timedelta_range("1day", "3day") + + result = idx.insert(0, "foo") + + expected = Index(["foo"] + list(idx), dtype=object) + tm.assert_index_equal(result, expected) + + def test_insert_empty(self): + # Corner case inserting with length zero doesn't raise IndexError + # GH#33573 for freq preservation + idx = timedelta_range("1 Day", periods=3) + td = idx[0] + + result = idx[:0].insert(0, td) + assert result.freq == "D" + + with pytest.raises(IndexError, match="loc must be an integer between"): + result = idx[:0].insert(1, td) + + with pytest.raises(IndexError, match="loc must be an integer between"): + result = idx[:0].insert(-1, td) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/test_repeat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/test_repeat.py new file mode 100644 index 0000000000000000000000000000000000000000..2a9b58d1bf322938e9344d0cbacfaa79674fcf0e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/test_repeat.py @@ -0,0 +1,34 @@ +import numpy as np + +from pandas import ( + TimedeltaIndex, + timedelta_range, +) +import pandas._testing as tm + + +class TestRepeat: + def test_repeat(self): + index = timedelta_range("1 days", periods=2, freq="D") + exp = TimedeltaIndex(["1 days", "1 days", "2 days", "2 days"]) + for res in [index.repeat(2), np.repeat(index, 2)]: + tm.assert_index_equal(res, exp) + assert res.freq is None + + index = TimedeltaIndex(["1 days", "NaT", "3 days"]) + exp = TimedeltaIndex( + [ + "1 days", + "1 days", + "1 days", + "NaT", + "NaT", + "NaT", + "3 days", + "3 days", + "3 days", + ] + ) + for res in [index.repeat(3), np.repeat(index, 3)]: + tm.assert_index_equal(res, exp) + assert res.freq is None diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/test_shift.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/test_shift.py new file mode 100644 index 0000000000000000000000000000000000000000..a0986d1496881a2061ac8306d0a064f4393cd4e9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/methods/test_shift.py @@ -0,0 +1,76 @@ +import pytest + +from pandas.errors import NullFrequencyError + +import pandas as pd +from pandas import TimedeltaIndex +import pandas._testing as tm + + +class TestTimedeltaIndexShift: + # ------------------------------------------------------------- + # TimedeltaIndex.shift is used by __add__/__sub__ + + def test_tdi_shift_empty(self): + # GH#9903 + idx = TimedeltaIndex([], name="xxx") + tm.assert_index_equal(idx.shift(0, freq="h"), idx) + tm.assert_index_equal(idx.shift(3, freq="h"), idx) + + def test_tdi_shift_hours(self): + # GH#9903 + idx = TimedeltaIndex(["5 hours", "6 hours", "9 hours"], name="xxx") + tm.assert_index_equal(idx.shift(0, freq="h"), idx) + exp = TimedeltaIndex(["8 hours", "9 hours", "12 hours"], name="xxx") + tm.assert_index_equal(idx.shift(3, freq="h"), exp) + exp = TimedeltaIndex(["2 hours", "3 hours", "6 hours"], name="xxx") + tm.assert_index_equal(idx.shift(-3, freq="h"), exp) + + def test_tdi_shift_minutes(self): + # GH#9903 + idx = TimedeltaIndex(["5 hours", "6 hours", "9 hours"], name="xxx") + tm.assert_index_equal(idx.shift(0, freq="min"), idx) + exp = TimedeltaIndex(["05:03:00", "06:03:00", "9:03:00"], name="xxx") + tm.assert_index_equal(idx.shift(3, freq="min"), exp) + exp = TimedeltaIndex(["04:57:00", "05:57:00", "8:57:00"], name="xxx") + tm.assert_index_equal(idx.shift(-3, freq="min"), exp) + + def test_tdi_shift_int(self): + # GH#8083 + tdi = pd.to_timedelta(range(5), unit="d") + trange = tdi._with_freq("infer") + pd.offsets.Hour(1) + result = trange.shift(1) + expected = TimedeltaIndex( + [ + "1 days 01:00:00", + "2 days 01:00:00", + "3 days 01:00:00", + "4 days 01:00:00", + "5 days 01:00:00", + ], + freq="D", + ) + tm.assert_index_equal(result, expected) + + def test_tdi_shift_nonstandard_freq(self): + # GH#8083 + tdi = pd.to_timedelta(range(5), unit="d") + trange = tdi._with_freq("infer") + pd.offsets.Hour(1) + result = trange.shift(3, freq="2D 1s") + expected = TimedeltaIndex( + [ + "6 days 01:00:03", + "7 days 01:00:03", + "8 days 01:00:03", + "9 days 01:00:03", + "10 days 01:00:03", + ], + freq="D", + ) + tm.assert_index_equal(result, expected) + + def test_shift_no_freq(self): + # GH#19147 + tdi = TimedeltaIndex(["1 days 01:00:00", "2 days 01:00:00"], freq=None) + with pytest.raises(NullFrequencyError, match="Cannot shift with no freq"): + tdi.shift(2) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_constructors.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_constructors.py new file mode 100644 index 0000000000000000000000000000000000000000..0510700bb64d7a626761a67d72ecfa6ecfba9ac4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_constructors.py @@ -0,0 +1,291 @@ +from datetime import timedelta + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Timedelta, + TimedeltaIndex, + timedelta_range, + to_timedelta, +) +import pandas._testing as tm +from pandas.core.arrays.timedeltas import TimedeltaArray + + +class TestTimedeltaIndex: + def test_closed_deprecated(self): + # GH#52628 + msg = "The 'closed' keyword" + with tm.assert_produces_warning(FutureWarning, match=msg): + TimedeltaIndex([], closed=True) + + def test_array_of_dt64_nat_raises(self): + # GH#39462 + nat = np.datetime64("NaT", "ns") + arr = np.array([nat], dtype=object) + + msg = "Invalid type for timedelta scalar" + with pytest.raises(TypeError, match=msg): + TimedeltaIndex(arr) + + with pytest.raises(TypeError, match=msg): + TimedeltaArray._from_sequence(arr, dtype="m8[ns]") + + with pytest.raises(TypeError, match=msg): + to_timedelta(arr) + + @pytest.mark.parametrize("unit", ["Y", "y", "M"]) + def test_unit_m_y_raises(self, unit): + msg = "Units 'M', 'Y', and 'y' are no longer supported" + depr_msg = "The 'unit' keyword in TimedeltaIndex construction is deprecated" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + TimedeltaIndex([1, 3, 7], unit) + + def test_int64_nocopy(self): + # GH#23539 check that a copy isn't made when we pass int64 data + # and copy=False + arr = np.arange(10, dtype=np.int64) + tdi = TimedeltaIndex(arr, copy=False) + assert tdi._data._ndarray.base is arr + + def test_infer_from_tdi(self): + # GH#23539 + # fast-path for inferring a frequency if the passed data already + # has one + tdi = timedelta_range("1 second", periods=10**7, freq="1s") + + result = TimedeltaIndex(tdi, freq="infer") + assert result.freq == tdi.freq + + # check that inferred_freq was not called by checking that the + # value has not been cached + assert "inferred_freq" not in getattr(result, "_cache", {}) + + def test_infer_from_tdi_mismatch(self): + # GH#23539 + # fast-path for invalidating a frequency if the passed data already + # has one and it does not match the `freq` input + tdi = timedelta_range("1 second", periods=100, freq="1s") + + depr_msg = "TimedeltaArray.__init__ is deprecated" + msg = ( + "Inferred frequency .* from passed values does " + "not conform to passed frequency" + ) + with pytest.raises(ValueError, match=msg): + TimedeltaIndex(tdi, freq="D") + + with pytest.raises(ValueError, match=msg): + # GH#23789 + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + TimedeltaArray(tdi, freq="D") + + with pytest.raises(ValueError, match=msg): + TimedeltaIndex(tdi._data, freq="D") + + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + TimedeltaArray(tdi._data, freq="D") + + def test_dt64_data_invalid(self): + # GH#23539 + # passing tz-aware DatetimeIndex raises, naive or ndarray[datetime64] + # raise as of GH#29794 + dti = pd.date_range("2016-01-01", periods=3) + + msg = "cannot be converted to timedelta64" + with pytest.raises(TypeError, match=msg): + TimedeltaIndex(dti.tz_localize("Europe/Brussels")) + + with pytest.raises(TypeError, match=msg): + TimedeltaIndex(dti) + + with pytest.raises(TypeError, match=msg): + TimedeltaIndex(np.asarray(dti)) + + def test_float64_ns_rounded(self): + # GH#23539 without specifying a unit, floats are regarded as nanos, + # and fractional portions are truncated + tdi = TimedeltaIndex([2.3, 9.7]) + expected = TimedeltaIndex([2, 9]) + tm.assert_index_equal(tdi, expected) + + # integral floats are non-lossy + tdi = TimedeltaIndex([2.0, 9.0]) + expected = TimedeltaIndex([2, 9]) + tm.assert_index_equal(tdi, expected) + + # NaNs get converted to NaT + tdi = TimedeltaIndex([2.0, np.nan]) + expected = TimedeltaIndex([Timedelta(nanoseconds=2), pd.NaT]) + tm.assert_index_equal(tdi, expected) + + def test_float64_unit_conversion(self): + # GH#23539 + tdi = to_timedelta([1.5, 2.25], unit="D") + expected = TimedeltaIndex([Timedelta(days=1.5), Timedelta(days=2.25)]) + tm.assert_index_equal(tdi, expected) + + def test_construction_base_constructor(self): + arr = [Timedelta("1 days"), pd.NaT, Timedelta("3 days")] + tm.assert_index_equal(pd.Index(arr), TimedeltaIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), TimedeltaIndex(np.array(arr))) + + arr = [np.nan, pd.NaT, Timedelta("1 days")] + tm.assert_index_equal(pd.Index(arr), TimedeltaIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), TimedeltaIndex(np.array(arr))) + + @pytest.mark.filterwarnings( + "ignore:The 'unit' keyword in TimedeltaIndex construction:FutureWarning" + ) + def test_constructor(self): + expected = TimedeltaIndex( + [ + "1 days", + "1 days 00:00:05", + "2 days", + "2 days 00:00:02", + "0 days 00:00:03", + ] + ) + result = TimedeltaIndex( + [ + "1 days", + "1 days, 00:00:05", + np.timedelta64(2, "D"), + timedelta(days=2, seconds=2), + pd.offsets.Second(3), + ] + ) + tm.assert_index_equal(result, expected) + + expected = TimedeltaIndex( + ["0 days 00:00:00", "0 days 00:00:01", "0 days 00:00:02"] + ) + result = TimedeltaIndex(range(3), unit="s") + tm.assert_index_equal(result, expected) + expected = TimedeltaIndex( + ["0 days 00:00:00", "0 days 00:00:05", "0 days 00:00:09"] + ) + result = TimedeltaIndex([0, 5, 9], unit="s") + tm.assert_index_equal(result, expected) + expected = TimedeltaIndex( + ["0 days 00:00:00.400", "0 days 00:00:00.450", "0 days 00:00:01.200"] + ) + result = TimedeltaIndex([400, 450, 1200], unit="ms") + tm.assert_index_equal(result, expected) + + def test_constructor_iso(self): + # GH #21877 + expected = timedelta_range("1s", periods=9, freq="s") + durations = [f"P0DT0H0M{i}S" for i in range(1, 10)] + result = to_timedelta(durations) + tm.assert_index_equal(result, expected) + + def test_timedelta_range_fractional_period(self): + msg = "Non-integer 'periods' in pd.date_range, pd.timedelta_range" + with tm.assert_produces_warning(FutureWarning, match=msg): + rng = timedelta_range("1 days", periods=10.5) + exp = timedelta_range("1 days", periods=10) + tm.assert_index_equal(rng, exp) + + def test_constructor_coverage(self): + msg = "periods must be a number, got foo" + with pytest.raises(TypeError, match=msg): + timedelta_range(start="1 days", periods="foo", freq="D") + + msg = ( + r"TimedeltaIndex\(\.\.\.\) must be called with a collection of some kind, " + "'1 days' was passed" + ) + with pytest.raises(TypeError, match=msg): + TimedeltaIndex("1 days") + + # generator expression + gen = (timedelta(i) for i in range(10)) + result = TimedeltaIndex(gen) + expected = TimedeltaIndex([timedelta(i) for i in range(10)]) + tm.assert_index_equal(result, expected) + + # NumPy string array + strings = np.array(["1 days", "2 days", "3 days"]) + result = TimedeltaIndex(strings) + expected = to_timedelta([1, 2, 3], unit="d") + tm.assert_index_equal(result, expected) + + from_ints = TimedeltaIndex(expected.asi8) + tm.assert_index_equal(from_ints, expected) + + # non-conforming freq + msg = ( + "Inferred frequency None from passed values does not conform to " + "passed frequency D" + ) + with pytest.raises(ValueError, match=msg): + TimedeltaIndex(["1 days", "2 days", "4 days"], freq="D") + + msg = ( + "Of the four parameters: start, end, periods, and freq, exactly " + "three must be specified" + ) + with pytest.raises(ValueError, match=msg): + timedelta_range(periods=10, freq="D") + + def test_constructor_name(self): + idx = timedelta_range(start="1 days", periods=1, freq="D", name="TEST") + assert idx.name == "TEST" + + # GH10025 + idx2 = TimedeltaIndex(idx, name="something else") + assert idx2.name == "something else" + + def test_constructor_no_precision_raises(self): + # GH-24753, GH-24739 + + msg = "with no precision is not allowed" + with pytest.raises(ValueError, match=msg): + TimedeltaIndex(["2000"], dtype="timedelta64") + + msg = "The 'timedelta64' dtype has no unit. Please pass in" + with pytest.raises(ValueError, match=msg): + pd.Index(["2000"], dtype="timedelta64") + + def test_constructor_wrong_precision_raises(self): + msg = "Supported timedelta64 resolutions are 's', 'ms', 'us', 'ns'" + with pytest.raises(ValueError, match=msg): + TimedeltaIndex(["2000"], dtype="timedelta64[D]") + + # "timedelta64[us]" was unsupported pre-2.0, but now this works. + tdi = TimedeltaIndex(["2000"], dtype="timedelta64[us]") + assert tdi.dtype == "m8[us]" + + def test_explicit_none_freq(self): + # Explicitly passing freq=None is respected + tdi = timedelta_range(1, periods=5) + assert tdi.freq is not None + + result = TimedeltaIndex(tdi, freq=None) + assert result.freq is None + + result = TimedeltaIndex(tdi._data, freq=None) + assert result.freq is None + + msg = "TimedeltaArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + tda = TimedeltaArray(tdi, freq=None) + assert tda.freq is None + + def test_from_categorical(self): + tdi = timedelta_range(1, periods=5) + + cat = pd.Categorical(tdi) + + result = TimedeltaIndex(cat) + tm.assert_index_equal(result, tdi) + + ci = pd.CategoricalIndex(tdi) + result = TimedeltaIndex(ci) + tm.assert_index_equal(result, tdi) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_delete.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_delete.py new file mode 100644 index 0000000000000000000000000000000000000000..6e6f54702ce1a09c0fccb0c44d0cd4a474c46a8c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_delete.py @@ -0,0 +1,71 @@ +from pandas import ( + TimedeltaIndex, + timedelta_range, +) +import pandas._testing as tm + + +class TestTimedeltaIndexDelete: + def test_delete(self): + idx = timedelta_range(start="1 Days", periods=5, freq="D", name="idx") + + # preserve freq + expected_0 = timedelta_range(start="2 Days", periods=4, freq="D", name="idx") + expected_4 = timedelta_range(start="1 Days", periods=4, freq="D", name="idx") + + # reset freq to None + expected_1 = TimedeltaIndex( + ["1 day", "3 day", "4 day", "5 day"], freq=None, name="idx" + ) + + cases = { + 0: expected_0, + -5: expected_0, + -1: expected_4, + 4: expected_4, + 1: expected_1, + } + for n, expected in cases.items(): + result = idx.delete(n) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + with tm.external_error_raised((IndexError, ValueError)): + # either depending on numpy version + idx.delete(5) + + def test_delete_slice(self): + idx = timedelta_range(start="1 days", periods=10, freq="D", name="idx") + + # preserve freq + expected_0_2 = timedelta_range(start="4 days", periods=7, freq="D", name="idx") + expected_7_9 = timedelta_range(start="1 days", periods=7, freq="D", name="idx") + + # reset freq to None + expected_3_5 = TimedeltaIndex( + ["1 d", "2 d", "3 d", "7 d", "8 d", "9 d", "10d"], freq=None, name="idx" + ) + + cases = { + (0, 1, 2): expected_0_2, + (7, 8, 9): expected_7_9, + (3, 4, 5): expected_3_5, + } + for n, expected in cases.items(): + result = idx.delete(n) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + result = idx.delete(slice(n[0], n[-1] + 1)) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + def test_delete_doesnt_infer_freq(self): + # GH#30655 behavior matches DatetimeIndex + + tdi = TimedeltaIndex(["1 Day", "2 Days", None, "3 Days", "4 Days"]) + result = tdi.delete(2) + assert result.freq is None diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_freq_attr.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_freq_attr.py new file mode 100644 index 0000000000000000000000000000000000000000..1912c49d3000fcbef45dd081213778bfb387e38e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_freq_attr.py @@ -0,0 +1,72 @@ +import pytest + +from pandas import TimedeltaIndex + +from pandas.tseries.offsets import ( + DateOffset, + Day, + Hour, + MonthEnd, +) + + +class TestFreq: + @pytest.mark.parametrize("values", [["0 days", "2 days", "4 days"], []]) + @pytest.mark.parametrize("freq", ["2D", Day(2), "48h", Hour(48)]) + def test_freq_setter(self, values, freq): + # GH#20678 + idx = TimedeltaIndex(values) + + # can set to an offset, converting from string if necessary + idx._data.freq = freq + assert idx.freq == freq + assert isinstance(idx.freq, DateOffset) + + # can reset to None + idx._data.freq = None + assert idx.freq is None + + def test_with_freq_empty_requires_tick(self): + idx = TimedeltaIndex([]) + + off = MonthEnd(1) + msg = "TimedeltaArray/Index freq must be a Tick" + with pytest.raises(TypeError, match=msg): + idx._with_freq(off) + with pytest.raises(TypeError, match=msg): + idx._data._with_freq(off) + + def test_freq_setter_errors(self): + # GH#20678 + idx = TimedeltaIndex(["0 days", "2 days", "4 days"]) + + # setting with an incompatible freq + msg = ( + "Inferred frequency 2D from passed values does not conform to " + "passed frequency 5D" + ) + with pytest.raises(ValueError, match=msg): + idx._data.freq = "5D" + + # setting with a non-fixed frequency + msg = r"<2 \* BusinessDays> is a non-fixed frequency" + with pytest.raises(ValueError, match=msg): + idx._data.freq = "2B" + + # setting with non-freq string + with pytest.raises(ValueError, match="Invalid frequency"): + idx._data.freq = "foo" + + def test_freq_view_safe(self): + # Setting the freq for one TimedeltaIndex shouldn't alter the freq + # for another that views the same data + + tdi = TimedeltaIndex(["0 days", "2 days", "4 days"], freq="2D") + tda = tdi._data + + tdi2 = TimedeltaIndex(tda)._with_freq(None) + assert tdi2.freq is None + + # Original was not altered + assert tdi.freq == "2D" + assert tda.freq == "2D" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_indexing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_indexing.py new file mode 100644 index 0000000000000000000000000000000000000000..397f9d9e183319f6df9335fbae7f8cb7401d6ac1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_indexing.py @@ -0,0 +1,347 @@ +from datetime import datetime +import re + +import numpy as np +import pytest + +from pandas import ( + Index, + NaT, + Timedelta, + TimedeltaIndex, + Timestamp, + notna, + offsets, + timedelta_range, + to_timedelta, +) +import pandas._testing as tm + + +class TestGetItem: + def test_getitem_slice_keeps_name(self): + # GH#4226 + tdi = timedelta_range("1d", "5d", freq="h", name="timebucket") + assert tdi[1:].name == tdi.name + + def test_getitem(self): + idx1 = timedelta_range("1 day", "31 day", freq="D", name="idx") + + for idx in [idx1]: + result = idx[0] + assert result == Timedelta("1 day") + + result = idx[0:5] + expected = timedelta_range("1 day", "5 day", freq="D", name="idx") + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx[0:10:2] + expected = timedelta_range("1 day", "9 day", freq="2D", name="idx") + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx[-20:-5:3] + expected = timedelta_range("12 day", "24 day", freq="3D", name="idx") + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx[4::-1] + expected = TimedeltaIndex( + ["5 day", "4 day", "3 day", "2 day", "1 day"], freq="-1D", name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + @pytest.mark.parametrize( + "key", + [ + Timestamp("1970-01-01"), + Timestamp("1970-01-02"), + datetime(1970, 1, 1), + Timestamp("1970-01-03").to_datetime64(), + # non-matching NA values + np.datetime64("NaT"), + ], + ) + def test_timestamp_invalid_key(self, key): + # GH#20464 + tdi = timedelta_range(0, periods=10) + with pytest.raises(KeyError, match=re.escape(repr(key))): + tdi.get_loc(key) + + +class TestGetLoc: + def test_get_loc_key_unit_mismatch(self): + idx = to_timedelta(["0 days", "1 days", "2 days"]) + key = idx[1].as_unit("ms") + loc = idx.get_loc(key) + assert loc == 1 + + def test_get_loc_key_unit_mismatch_not_castable(self): + tdi = to_timedelta(["0 days", "1 days", "2 days"]).astype("m8[s]") + assert tdi.dtype == "m8[s]" + key = tdi[0].as_unit("ns") + Timedelta(1) + + with pytest.raises(KeyError, match=r"Timedelta\('0 days 00:00:00.000000001'\)"): + tdi.get_loc(key) + + assert key not in tdi + + def test_get_loc(self): + idx = to_timedelta(["0 days", "1 days", "2 days"]) + + # GH 16909 + assert idx.get_loc(idx[1].to_timedelta64()) == 1 + + # GH 16896 + assert idx.get_loc("0 days") == 0 + + def test_get_loc_nat(self): + tidx = TimedeltaIndex(["1 days 01:00:00", "NaT", "2 days 01:00:00"]) + + assert tidx.get_loc(NaT) == 1 + assert tidx.get_loc(None) == 1 + assert tidx.get_loc(float("nan")) == 1 + assert tidx.get_loc(np.nan) == 1 + + +class TestGetIndexer: + def test_get_indexer(self): + idx = to_timedelta(["0 days", "1 days", "2 days"]) + tm.assert_numpy_array_equal( + idx.get_indexer(idx), np.array([0, 1, 2], dtype=np.intp) + ) + + target = to_timedelta(["-1 hour", "12 hours", "1 day 1 hour"]) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "pad"), np.array([-1, 0, 1], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "backfill"), np.array([0, 1, 2], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest"), np.array([0, 1, 1], dtype=np.intp) + ) + + res = idx.get_indexer(target, "nearest", tolerance=Timedelta("1 hour")) + tm.assert_numpy_array_equal(res, np.array([0, -1, 1], dtype=np.intp)) + + +class TestWhere: + def test_where_doesnt_retain_freq(self): + tdi = timedelta_range("1 day", periods=3, freq="D", name="idx") + cond = [True, True, False] + expected = TimedeltaIndex([tdi[0], tdi[1], tdi[0]], freq=None, name="idx") + + result = tdi.where(cond, tdi[::-1]) + tm.assert_index_equal(result, expected) + + def test_where_invalid_dtypes(self, fixed_now_ts): + tdi = timedelta_range("1 day", periods=3, freq="D", name="idx") + + tail = tdi[2:].tolist() + i2 = Index([NaT, NaT] + tail) + mask = notna(i2) + + expected = Index([NaT._value, NaT._value] + tail, dtype=object, name="idx") + assert isinstance(expected[0], int) + result = tdi.where(mask, i2.asi8) + tm.assert_index_equal(result, expected) + + ts = i2 + fixed_now_ts + expected = Index([ts[0], ts[1]] + tail, dtype=object, name="idx") + result = tdi.where(mask, ts) + tm.assert_index_equal(result, expected) + + per = (i2 + fixed_now_ts).to_period("D") + expected = Index([per[0], per[1]] + tail, dtype=object, name="idx") + result = tdi.where(mask, per) + tm.assert_index_equal(result, expected) + + ts = fixed_now_ts + expected = Index([ts, ts] + tail, dtype=object, name="idx") + result = tdi.where(mask, ts) + tm.assert_index_equal(result, expected) + + def test_where_mismatched_nat(self): + tdi = timedelta_range("1 day", periods=3, freq="D", name="idx") + cond = np.array([True, False, False]) + + dtnat = np.datetime64("NaT", "ns") + expected = Index([tdi[0], dtnat, dtnat], dtype=object, name="idx") + assert expected[2] is dtnat + result = tdi.where(cond, dtnat) + tm.assert_index_equal(result, expected) + + +class TestTake: + def test_take(self): + # GH 10295 + idx1 = timedelta_range("1 day", "31 day", freq="D", name="idx") + + for idx in [idx1]: + result = idx.take([0]) + assert result == Timedelta("1 day") + + result = idx.take([-1]) + assert result == Timedelta("31 day") + + result = idx.take([0, 1, 2]) + expected = timedelta_range("1 day", "3 day", freq="D", name="idx") + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx.take([0, 2, 4]) + expected = timedelta_range("1 day", "5 day", freq="2D", name="idx") + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx.take([7, 4, 1]) + expected = timedelta_range("8 day", "2 day", freq="-3D", name="idx") + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx.take([3, 2, 5]) + expected = TimedeltaIndex(["4 day", "3 day", "6 day"], name="idx") + tm.assert_index_equal(result, expected) + assert result.freq is None + + result = idx.take([-3, 2, 5]) + expected = TimedeltaIndex(["29 day", "3 day", "6 day"], name="idx") + tm.assert_index_equal(result, expected) + assert result.freq is None + + def test_take_invalid_kwargs(self): + idx = timedelta_range("1 day", "31 day", freq="D", name="idx") + indices = [1, 6, 5, 9, 10, 13, 15, 3] + + msg = r"take\(\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=msg): + idx.take(indices, foo=2) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + idx.take(indices, out=indices) + + msg = "the 'mode' parameter is not supported" + with pytest.raises(ValueError, match=msg): + idx.take(indices, mode="clip") + + def test_take_equiv_getitem(self): + tds = ["1day 02:00:00", "1 day 04:00:00", "1 day 10:00:00"] + idx = timedelta_range(start="1d", end="2d", freq="h", name="idx") + expected = TimedeltaIndex(tds, freq=None, name="idx") + + taken1 = idx.take([2, 4, 10]) + taken2 = idx[[2, 4, 10]] + + for taken in [taken1, taken2]: + tm.assert_index_equal(taken, expected) + assert isinstance(taken, TimedeltaIndex) + assert taken.freq is None + assert taken.name == expected.name + + def test_take_fill_value(self): + # GH 12631 + idx = TimedeltaIndex(["1 days", "2 days", "3 days"], name="xxx") + result = idx.take(np.array([1, 0, -1])) + expected = TimedeltaIndex(["2 days", "1 days", "3 days"], name="xxx") + tm.assert_index_equal(result, expected) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + expected = TimedeltaIndex(["2 days", "1 days", "NaT"], name="xxx") + tm.assert_index_equal(result, expected) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = TimedeltaIndex(["2 days", "1 days", "3 days"], name="xxx") + tm.assert_index_equal(result, expected) + + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + msg = "index -5 is out of bounds for (axis 0 with )?size 3" + with pytest.raises(IndexError, match=msg): + idx.take(np.array([1, -5])) + + +class TestMaybeCastSliceBound: + @pytest.fixture(params=["increasing", "decreasing", None]) + def monotonic(self, request): + return request.param + + @pytest.fixture + def tdi(self, monotonic): + tdi = timedelta_range("1 Day", periods=10) + if monotonic == "decreasing": + tdi = tdi[::-1] + elif monotonic is None: + taker = np.arange(10, dtype=np.intp) + np.random.default_rng(2).shuffle(taker) + tdi = tdi.take(taker) + return tdi + + def test_maybe_cast_slice_bound_invalid_str(self, tdi): + # test the low-level _maybe_cast_slice_bound and that we get the + # expected exception+message all the way up the stack + msg = ( + "cannot do slice indexing on TimedeltaIndex with these " + r"indexers \[foo\] of type str" + ) + with pytest.raises(TypeError, match=msg): + tdi._maybe_cast_slice_bound("foo", side="left") + with pytest.raises(TypeError, match=msg): + tdi.get_slice_bound("foo", side="left") + with pytest.raises(TypeError, match=msg): + tdi.slice_locs("foo", None, None) + + def test_slice_invalid_str_with_timedeltaindex( + self, tdi, frame_or_series, indexer_sl + ): + obj = frame_or_series(range(10), index=tdi) + + msg = ( + "cannot do slice indexing on TimedeltaIndex with these " + r"indexers \[foo\] of type str" + ) + with pytest.raises(TypeError, match=msg): + indexer_sl(obj)["foo":] + with pytest.raises(TypeError, match=msg): + indexer_sl(obj)["foo":-1] + with pytest.raises(TypeError, match=msg): + indexer_sl(obj)[:"foo"] + with pytest.raises(TypeError, match=msg): + indexer_sl(obj)[tdi[0] : "foo"] + + +class TestContains: + def test_contains_nonunique(self): + # GH#9512 + for vals in ( + [0, 1, 0], + [0, 0, -1], + [0, -1, -1], + ["00:01:00", "00:01:00", "00:02:00"], + ["00:01:00", "00:01:00", "00:00:01"], + ): + idx = TimedeltaIndex(vals) + assert idx[0] in idx + + def test_contains(self): + # Checking for any NaT-like objects + # GH#13603 + td = to_timedelta(range(5), unit="d") + offsets.Hour(1) + for v in [NaT, None, float("nan"), np.nan]: + assert v not in td + + td = to_timedelta([NaT]) + for v in [NaT, None, float("nan"), np.nan]: + assert v in td diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_join.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_join.py new file mode 100644 index 0000000000000000000000000000000000000000..cbd7a5de71b10a6004cd7a3f798fecd8e7631750 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_join.py @@ -0,0 +1,47 @@ +import numpy as np + +from pandas import ( + DataFrame, + Index, + Timedelta, + timedelta_range, +) +import pandas._testing as tm + + +class TestJoin: + def test_append_join_nondatetimeindex(self): + rng = timedelta_range("1 days", periods=10) + idx = Index(["a", "b", "c", "d"]) + + result = rng.append(idx) + assert isinstance(result[0], Timedelta) + + # it works + rng.join(idx, how="outer") + + def test_join_self(self, join_type): + index = timedelta_range("1 day", periods=10) + joined = index.join(index, how=join_type) + tm.assert_index_equal(index, joined) + + def test_does_not_convert_mixed_integer(self): + df = DataFrame(np.ones((5, 5)), columns=timedelta_range("1 day", periods=5)) + + cols = df.columns.join(df.index, how="outer") + joined = cols.join(df.columns) + assert cols.dtype == np.dtype("O") + assert cols.dtype == joined.dtype + tm.assert_index_equal(cols, joined) + + def test_join_preserves_freq(self): + # GH#32157 + tdi = timedelta_range("1 day", periods=10) + result = tdi[:5].join(tdi[5:], how="outer") + assert result.freq == tdi.freq + tm.assert_index_equal(result, tdi) + + result = tdi[:5].join(tdi[6:], how="outer") + assert result.freq is None + expected = tdi.delete(5) + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_pickle.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_pickle.py new file mode 100644 index 0000000000000000000000000000000000000000..befe709728bdd4e9fac3c626f4e33986d671c86d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_pickle.py @@ -0,0 +1,11 @@ +from pandas import timedelta_range +import pandas._testing as tm + + +class TestPickle: + def test_pickle_after_set_freq(self): + tdi = timedelta_range("1 day", periods=4, freq="s") + tdi = tdi._with_freq(None) + + res = tm.round_trip_pickle(tdi) + tm.assert_index_equal(res, tdi) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_scalar_compat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_scalar_compat.py new file mode 100644 index 0000000000000000000000000000000000000000..9f0552f8baa901addaae9b4ca0890f6edb272715 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_scalar_compat.py @@ -0,0 +1,142 @@ +""" +Tests for TimedeltaIndex methods behaving like their Timedelta counterparts +""" + +import numpy as np +import pytest + +from pandas._libs.tslibs.offsets import INVALID_FREQ_ERR_MSG + +from pandas import ( + Index, + Series, + Timedelta, + TimedeltaIndex, + timedelta_range, +) +import pandas._testing as tm + + +class TestVectorizedTimedelta: + def test_tdi_total_seconds(self): + # GH#10939 + # test index + rng = timedelta_range("1 days, 10:11:12.100123456", periods=2, freq="s") + expt = [ + 1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456.0 / 1e9, + 1 * 86400 + 10 * 3600 + 11 * 60 + 13 + 100123456.0 / 1e9, + ] + tm.assert_almost_equal(rng.total_seconds(), Index(expt)) + + # test Series + ser = Series(rng) + s_expt = Series(expt, index=[0, 1]) + tm.assert_series_equal(ser.dt.total_seconds(), s_expt) + + # with nat + ser[1] = np.nan + s_expt = Series( + [1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456.0 / 1e9, np.nan], + index=[0, 1], + ) + tm.assert_series_equal(ser.dt.total_seconds(), s_expt) + + def test_tdi_total_seconds_all_nat(self): + # with both nat + ser = Series([np.nan, np.nan], dtype="timedelta64[ns]") + result = ser.dt.total_seconds() + expected = Series([np.nan, np.nan]) + tm.assert_series_equal(result, expected) + + def test_tdi_round(self): + td = timedelta_range(start="16801 days", periods=5, freq="30Min") + elt = td[1] + + expected_rng = TimedeltaIndex( + [ + Timedelta("16801 days 00:00:00"), + Timedelta("16801 days 00:00:00"), + Timedelta("16801 days 01:00:00"), + Timedelta("16801 days 02:00:00"), + Timedelta("16801 days 02:00:00"), + ] + ) + expected_elt = expected_rng[1] + + tm.assert_index_equal(td.round(freq="h"), expected_rng) + assert elt.round(freq="h") == expected_elt + + msg = INVALID_FREQ_ERR_MSG + with pytest.raises(ValueError, match=msg): + td.round(freq="foo") + with pytest.raises(ValueError, match=msg): + elt.round(freq="foo") + + msg = " is a non-fixed frequency" + with pytest.raises(ValueError, match=msg): + td.round(freq="ME") + with pytest.raises(ValueError, match=msg): + elt.round(freq="ME") + + @pytest.mark.parametrize( + "freq,msg", + [ + ("YE", " is a non-fixed frequency"), + ("ME", " is a non-fixed frequency"), + ("foobar", "Invalid frequency: foobar"), + ], + ) + def test_tdi_round_invalid(self, freq, msg): + t1 = timedelta_range("1 days", periods=3, freq="1 min 2 s 3 us") + + with pytest.raises(ValueError, match=msg): + t1.round(freq) + with pytest.raises(ValueError, match=msg): + # Same test for TimedeltaArray + t1._data.round(freq) + + # TODO: de-duplicate with test_tdi_round + def test_round(self): + t1 = timedelta_range("1 days", periods=3, freq="1 min 2 s 3 us") + t2 = -1 * t1 + t1a = timedelta_range("1 days", periods=3, freq="1 min 2 s") + t1c = TimedeltaIndex(np.array([1, 1, 1], "m8[D]")).as_unit("ns") + + # note that negative times round DOWN! so don't give whole numbers + for freq, s1, s2 in [ + ("ns", t1, t2), + ("us", t1, t2), + ( + "ms", + t1a, + TimedeltaIndex( + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] + ), + ), + ( + "s", + t1a, + TimedeltaIndex( + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] + ), + ), + ("12min", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), + ("h", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), + ("d", t1c, -1 * t1c), + ]: + r1 = t1.round(freq) + tm.assert_index_equal(r1, s1) + r2 = t2.round(freq) + tm.assert_index_equal(r2, s2) + + def test_components(self): + rng = timedelta_range("1 days, 10:11:12", periods=2, freq="s") + rng.components + + # with nat + s = Series(rng) + s[1] = np.nan + + result = s.dt.components + assert not result.iloc[0].isna().all() + assert result.iloc[1].isna().all() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_searchsorted.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_searchsorted.py new file mode 100644 index 0000000000000000000000000000000000000000..710571ef383970097985f44e09ecba77fcf63f74 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_searchsorted.py @@ -0,0 +1,28 @@ +import numpy as np +import pytest + +from pandas import ( + TimedeltaIndex, + Timestamp, +) +import pandas._testing as tm + + +class TestSearchSorted: + def test_searchsorted_different_argument_classes(self, listlike_box): + idx = TimedeltaIndex(["1 day", "2 days", "3 days"]) + result = idx.searchsorted(listlike_box(idx)) + expected = np.arange(len(idx), dtype=result.dtype) + tm.assert_numpy_array_equal(result, expected) + + result = idx._data.searchsorted(listlike_box(idx)) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "arg", [[1, 2], ["a", "b"], [Timestamp("2020-01-01", tz="Europe/London")] * 2] + ) + def test_searchsorted_invalid_argument_dtype(self, arg): + idx = TimedeltaIndex(["1 day", "2 days", "3 days"]) + msg = "value should be a 'Timedelta', 'NaT', or array of those. Got" + with pytest.raises(TypeError, match=msg): + idx.searchsorted(arg) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_setops.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_setops.py new file mode 100644 index 0000000000000000000000000000000000000000..fce10d9176d7438e63a5e46eede1bb96b41bb8bd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_setops.py @@ -0,0 +1,254 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Index, + TimedeltaIndex, + timedelta_range, +) +import pandas._testing as tm + +from pandas.tseries.offsets import Hour + + +class TestTimedeltaIndex: + def test_union(self): + i1 = timedelta_range("1day", periods=5) + i2 = timedelta_range("3day", periods=5) + result = i1.union(i2) + expected = timedelta_range("1day", periods=7) + tm.assert_index_equal(result, expected) + + i1 = Index(np.arange(0, 20, 2, dtype=np.int64)) + i2 = timedelta_range(start="1 day", periods=10, freq="D") + i1.union(i2) # Works + i2.union(i1) # Fails with "AttributeError: can't set attribute" + + def test_union_sort_false(self): + tdi = timedelta_range("1day", periods=5) + + left = tdi[3:] + right = tdi[:3] + + # Check that we are testing the desired code path + assert left._can_fast_union(right) + + result = left.union(right) + tm.assert_index_equal(result, tdi) + + result = left.union(right, sort=False) + expected = TimedeltaIndex(["4 Days", "5 Days", "1 Days", "2 Day", "3 Days"]) + tm.assert_index_equal(result, expected) + + def test_union_coverage(self): + idx = TimedeltaIndex(["3d", "1d", "2d"]) + ordered = TimedeltaIndex(idx.sort_values(), freq="infer") + result = ordered.union(idx) + tm.assert_index_equal(result, ordered) + + result = ordered[:0].union(ordered) + tm.assert_index_equal(result, ordered) + assert result.freq == ordered.freq + + def test_union_bug_1730(self): + rng_a = timedelta_range("1 day", periods=4, freq="3h") + rng_b = timedelta_range("1 day", periods=4, freq="4h") + + result = rng_a.union(rng_b) + exp = TimedeltaIndex(sorted(set(rng_a) | set(rng_b))) + tm.assert_index_equal(result, exp) + + def test_union_bug_1745(self): + left = TimedeltaIndex(["1 day 15:19:49.695000"]) + right = TimedeltaIndex( + ["2 day 13:04:21.322000", "1 day 15:27:24.873000", "1 day 15:31:05.350000"] + ) + + result = left.union(right) + exp = TimedeltaIndex(sorted(set(left) | set(right))) + tm.assert_index_equal(result, exp) + + def test_union_bug_4564(self): + left = timedelta_range("1 day", "30d") + right = left + pd.offsets.Minute(15) + + result = left.union(right) + exp = TimedeltaIndex(sorted(set(left) | set(right))) + tm.assert_index_equal(result, exp) + + def test_union_freq_infer(self): + # When taking the union of two TimedeltaIndexes, we infer + # a freq even if the arguments don't have freq. This matches + # DatetimeIndex behavior. + tdi = timedelta_range("1 Day", periods=5) + left = tdi[[0, 1, 3, 4]] + right = tdi[[2, 3, 1]] + + assert left.freq is None + assert right.freq is None + + result = left.union(right) + tm.assert_index_equal(result, tdi) + assert result.freq == "D" + + def test_intersection_bug_1708(self): + index_1 = timedelta_range("1 day", periods=4, freq="h") + index_2 = index_1 + pd.offsets.Hour(5) + + result = index_1.intersection(index_2) + assert len(result) == 0 + + index_1 = timedelta_range("1 day", periods=4, freq="h") + index_2 = index_1 + pd.offsets.Hour(1) + + result = index_1.intersection(index_2) + expected = timedelta_range("1 day 01:00:00", periods=3, freq="h") + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + def test_intersection_equal(self, sort): + # GH 24471 Test intersection outcome given the sort keyword + # for equal indices intersection should return the original index + first = timedelta_range("1 day", periods=4, freq="h") + second = timedelta_range("1 day", periods=4, freq="h") + intersect = first.intersection(second, sort=sort) + if sort is None: + tm.assert_index_equal(intersect, second.sort_values()) + tm.assert_index_equal(intersect, second) + + # Corner cases + inter = first.intersection(first, sort=sort) + assert inter is first + + @pytest.mark.parametrize("period_1, period_2", [(0, 4), (4, 0)]) + def test_intersection_zero_length(self, period_1, period_2, sort): + # GH 24471 test for non overlap the intersection should be zero length + index_1 = timedelta_range("1 day", periods=period_1, freq="h") + index_2 = timedelta_range("1 day", periods=period_2, freq="h") + expected = timedelta_range("1 day", periods=0, freq="h") + result = index_1.intersection(index_2, sort=sort) + tm.assert_index_equal(result, expected) + + def test_zero_length_input_index(self, sort): + # GH 24966 test for 0-len intersections are copied + index_1 = timedelta_range("1 day", periods=0, freq="h") + index_2 = timedelta_range("1 day", periods=3, freq="h") + result = index_1.intersection(index_2, sort=sort) + assert index_1 is not result + assert index_2 is not result + tm.assert_copy(result, index_1) + + @pytest.mark.parametrize( + "rng, expected", + # if target has the same name, it is preserved + [ + ( + timedelta_range("1 day", periods=5, freq="h", name="idx"), + timedelta_range("1 day", periods=4, freq="h", name="idx"), + ), + # if target name is different, it will be reset + ( + timedelta_range("1 day", periods=5, freq="h", name="other"), + timedelta_range("1 day", periods=4, freq="h", name=None), + ), + # if no overlap exists return empty index + ( + timedelta_range("1 day", periods=10, freq="h", name="idx")[5:], + TimedeltaIndex([], freq="h", name="idx"), + ), + ], + ) + def test_intersection(self, rng, expected, sort): + # GH 4690 (with tz) + base = timedelta_range("1 day", periods=4, freq="h", name="idx") + result = base.intersection(rng, sort=sort) + if sort is None: + expected = expected.sort_values() + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + @pytest.mark.parametrize( + "rng, expected", + # part intersection works + [ + ( + TimedeltaIndex(["5 hour", "2 hour", "4 hour", "9 hour"], name="idx"), + TimedeltaIndex(["2 hour", "4 hour"], name="idx"), + ), + # reordered part intersection + ( + TimedeltaIndex(["2 hour", "5 hour", "5 hour", "1 hour"], name="other"), + TimedeltaIndex(["1 hour", "2 hour"], name=None), + ), + # reversed index + ( + TimedeltaIndex(["1 hour", "2 hour", "4 hour", "3 hour"], name="idx")[ + ::-1 + ], + TimedeltaIndex(["1 hour", "2 hour", "4 hour", "3 hour"], name="idx"), + ), + ], + ) + def test_intersection_non_monotonic(self, rng, expected, sort): + # 24471 non-monotonic + base = TimedeltaIndex(["1 hour", "2 hour", "4 hour", "3 hour"], name="idx") + result = base.intersection(rng, sort=sort) + if sort is None: + expected = expected.sort_values() + tm.assert_index_equal(result, expected) + assert result.name == expected.name + + # if reversed order, frequency is still the same + if all(base == rng[::-1]) and sort is None: + assert isinstance(result.freq, Hour) + else: + assert result.freq is None + + +class TestTimedeltaIndexDifference: + def test_difference_freq(self, sort): + # GH14323: Difference of TimedeltaIndex should not preserve frequency + + index = timedelta_range("0 days", "5 days", freq="D") + + other = timedelta_range("1 days", "4 days", freq="D") + expected = TimedeltaIndex(["0 days", "5 days"], freq=None) + idx_diff = index.difference(other, sort) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) + + # preserve frequency when the difference is a contiguous + # subset of the original range + other = timedelta_range("2 days", "5 days", freq="D") + idx_diff = index.difference(other, sort) + expected = TimedeltaIndex(["0 days", "1 days"], freq="D") + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) + + def test_difference_sort(self, sort): + index = TimedeltaIndex( + ["5 days", "3 days", "2 days", "4 days", "1 days", "0 days"] + ) + + other = timedelta_range("1 days", "4 days", freq="D") + idx_diff = index.difference(other, sort) + + expected = TimedeltaIndex(["5 days", "0 days"], freq=None) + + if sort is None: + expected = expected.sort_values() + + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) + + other = timedelta_range("2 days", "5 days", freq="D") + idx_diff = index.difference(other, sort) + expected = TimedeltaIndex(["1 days", "0 days"], freq=None) + + if sort is None: + expected = expected.sort_values() + + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_timedelta_range.py new file mode 100644 index 0000000000000000000000000000000000000000..f22bdb7a90516a7162ebdb1cc2d8cbfd9531b9e7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/test_timedelta_range.py @@ -0,0 +1,173 @@ +import numpy as np +import pytest + +from pandas import ( + Timedelta, + TimedeltaIndex, + timedelta_range, + to_timedelta, +) +import pandas._testing as tm + +from pandas.tseries.offsets import ( + Day, + Second, +) + + +class TestTimedeltas: + def test_timedelta_range_unit(self): + # GH#49824 + tdi = timedelta_range("0 Days", periods=10, freq="100000D", unit="s") + exp_arr = (np.arange(10, dtype="i8") * 100_000).view("m8[D]").astype("m8[s]") + tm.assert_numpy_array_equal(tdi.to_numpy(), exp_arr) + + def test_timedelta_range(self): + expected = to_timedelta(np.arange(5), unit="D") + result = timedelta_range("0 days", periods=5, freq="D") + tm.assert_index_equal(result, expected) + + expected = to_timedelta(np.arange(11), unit="D") + result = timedelta_range("0 days", "10 days", freq="D") + tm.assert_index_equal(result, expected) + + expected = to_timedelta(np.arange(5), unit="D") + Second(2) + Day() + result = timedelta_range("1 days, 00:00:02", "5 days, 00:00:02", freq="D") + tm.assert_index_equal(result, expected) + + expected = to_timedelta([1, 3, 5, 7, 9], unit="D") + Second(2) + result = timedelta_range("1 days, 00:00:02", periods=5, freq="2D") + tm.assert_index_equal(result, expected) + + expected = to_timedelta(np.arange(50), unit="min") * 30 + result = timedelta_range("0 days", freq="30min", periods=50) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "depr_unit, unit", + [ + ("H", "hour"), + ("T", "minute"), + ("t", "minute"), + ("S", "second"), + ("L", "millisecond"), + ("l", "millisecond"), + ("U", "microsecond"), + ("u", "microsecond"), + ("N", "nanosecond"), + ("n", "nanosecond"), + ], + ) + def test_timedelta_units_H_T_S_L_U_N_deprecated(self, depr_unit, unit): + # GH#52536 + depr_msg = ( + f"'{depr_unit}' is deprecated and will be removed in a future version." + ) + + expected = to_timedelta(np.arange(5), unit=unit) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = to_timedelta(np.arange(5), unit=depr_unit) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "periods, freq", [(3, "2D"), (5, "D"), (6, "19h12min"), (7, "16h"), (9, "12h")] + ) + def test_linspace_behavior(self, periods, freq): + # GH 20976 + result = timedelta_range(start="0 days", end="4 days", periods=periods) + expected = timedelta_range(start="0 days", end="4 days", freq=freq) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("msg_freq, freq", [("H", "19H12min"), ("T", "19h12T")]) + def test_timedelta_range_H_T_deprecated(self, freq, msg_freq): + # GH#52536 + msg = f"'{msg_freq}' is deprecated and will be removed in a future version." + + result = timedelta_range(start="0 days", end="4 days", periods=6) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = timedelta_range(start="0 days", end="4 days", freq=freq) + tm.assert_index_equal(result, expected) + + def test_errors(self): + # not enough params + msg = ( + "Of the four parameters: start, end, periods, and freq, " + "exactly three must be specified" + ) + with pytest.raises(ValueError, match=msg): + timedelta_range(start="0 days") + + with pytest.raises(ValueError, match=msg): + timedelta_range(end="5 days") + + with pytest.raises(ValueError, match=msg): + timedelta_range(periods=2) + + with pytest.raises(ValueError, match=msg): + timedelta_range() + + # too many params + with pytest.raises(ValueError, match=msg): + timedelta_range(start="0 days", end="5 days", periods=10, freq="h") + + @pytest.mark.parametrize( + "start, end, freq, expected_periods", + [ + ("1D", "10D", "2D", (10 - 1) // 2 + 1), + ("2D", "30D", "3D", (30 - 2) // 3 + 1), + ("2s", "50s", "5s", (50 - 2) // 5 + 1), + # tests that worked before GH 33498: + ("4D", "16D", "3D", (16 - 4) // 3 + 1), + ("8D", "16D", "40s", (16 * 3600 * 24 - 8 * 3600 * 24) // 40 + 1), + ], + ) + def test_timedelta_range_freq_divide_end(self, start, end, freq, expected_periods): + # GH 33498 only the cases where `(end % freq) == 0` used to fail + res = timedelta_range(start=start, end=end, freq=freq) + assert Timedelta(start) == res[0] + assert Timedelta(end) >= res[-1] + assert len(res) == expected_periods + + def test_timedelta_range_infer_freq(self): + # https://github.com/pandas-dev/pandas/issues/35897 + result = timedelta_range("0s", "1s", periods=31) + assert result.freq is None + + @pytest.mark.parametrize( + "freq_depr, start, end, expected_values, expected_freq", + [ + ( + "3.5S", + "05:03:01", + "05:03:10", + ["0 days 05:03:01", "0 days 05:03:04.500000", "0 days 05:03:08"], + "3500ms", + ), + ( + "2.5T", + "5 hours", + "5 hours 8 minutes", + [ + "0 days 05:00:00", + "0 days 05:02:30", + "0 days 05:05:00", + "0 days 05:07:30", + ], + "150s", + ), + ], + ) + def test_timedelta_range_deprecated_freq( + self, freq_depr, start, end, expected_values, expected_freq + ): + # GH#52536 + msg = ( + f"'{freq_depr[-1]}' is deprecated and will be removed in a future version." + ) + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = timedelta_range(start=start, end=end, freq=freq_depr) + expected = TimedeltaIndex( + expected_values, dtype="timedelta64[ns]", freq=expected_freq + ) + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/interval/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/interval/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/interval/test_interval.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/interval/test_interval.py new file mode 100644 index 0000000000000000000000000000000000000000..dd51917b85a59b2ae88ac0c029dadaa6ff8f19da --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/interval/test_interval.py @@ -0,0 +1,225 @@ +import numpy as np +import pytest + +from pandas._libs import index as libindex + +import pandas as pd +from pandas import ( + DataFrame, + IntervalIndex, + Series, +) +import pandas._testing as tm + + +class TestIntervalIndex: + @pytest.fixture + def series_with_interval_index(self): + return Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6))) + + def test_getitem_with_scalar(self, series_with_interval_index, indexer_sl): + ser = series_with_interval_index.copy() + + expected = ser.iloc[:3] + tm.assert_series_equal(expected, indexer_sl(ser)[:3]) + tm.assert_series_equal(expected, indexer_sl(ser)[:2.5]) + tm.assert_series_equal(expected, indexer_sl(ser)[0.1:2.5]) + if indexer_sl is tm.loc: + tm.assert_series_equal(expected, ser.loc[-1:3]) + + expected = ser.iloc[1:4] + tm.assert_series_equal(expected, indexer_sl(ser)[[1.5, 2.5, 3.5]]) + tm.assert_series_equal(expected, indexer_sl(ser)[[2, 3, 4]]) + tm.assert_series_equal(expected, indexer_sl(ser)[[1.5, 3, 4]]) + + expected = ser.iloc[2:5] + tm.assert_series_equal(expected, indexer_sl(ser)[ser >= 2]) + + @pytest.mark.parametrize("direction", ["increasing", "decreasing"]) + def test_getitem_nonoverlapping_monotonic(self, direction, closed, indexer_sl): + tpls = [(0, 1), (2, 3), (4, 5)] + if direction == "decreasing": + tpls = tpls[::-1] + + idx = IntervalIndex.from_tuples(tpls, closed=closed) + ser = Series(list("abc"), idx) + + for key, expected in zip(idx.left, ser): + if idx.closed_left: + assert indexer_sl(ser)[key] == expected + else: + with pytest.raises(KeyError, match=str(key)): + indexer_sl(ser)[key] + + for key, expected in zip(idx.right, ser): + if idx.closed_right: + assert indexer_sl(ser)[key] == expected + else: + with pytest.raises(KeyError, match=str(key)): + indexer_sl(ser)[key] + + for key, expected in zip(idx.mid, ser): + assert indexer_sl(ser)[key] == expected + + def test_getitem_non_matching(self, series_with_interval_index, indexer_sl): + ser = series_with_interval_index.copy() + + # this is a departure from our current + # indexing scheme, but simpler + with pytest.raises(KeyError, match=r"\[-1\] not in index"): + indexer_sl(ser)[[-1, 3, 4, 5]] + + with pytest.raises(KeyError, match=r"\[-1\] not in index"): + indexer_sl(ser)[[-1, 3]] + + def test_loc_getitem_large_series(self, monkeypatch): + size_cutoff = 20 + with monkeypatch.context(): + monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff) + ser = Series( + np.arange(size_cutoff), + index=IntervalIndex.from_breaks(np.arange(size_cutoff + 1)), + ) + + result1 = ser.loc[:8] + result2 = ser.loc[0:8] + result3 = ser.loc[0:8:1] + tm.assert_series_equal(result1, result2) + tm.assert_series_equal(result1, result3) + + def test_loc_getitem_frame(self): + # CategoricalIndex with IntervalIndex categories + df = DataFrame({"A": range(10)}) + ser = pd.cut(df.A, 5) + df["B"] = ser + df = df.set_index("B") + + result = df.loc[4] + expected = df.iloc[4:6] + tm.assert_frame_equal(result, expected) + + with pytest.raises(KeyError, match="10"): + df.loc[10] + + # single list-like + result = df.loc[[4]] + expected = df.iloc[4:6] + tm.assert_frame_equal(result, expected) + + # non-unique + result = df.loc[[4, 5]] + expected = df.take([4, 5, 4, 5]) + tm.assert_frame_equal(result, expected) + + msg = ( + r"None of \[Index\(\[10\], dtype='object', name='B'\)\] " + r"are in the \[index\]" + ) + with pytest.raises(KeyError, match=msg): + df.loc[[10]] + + # partial missing + with pytest.raises(KeyError, match=r"\[10\] not in index"): + df.loc[[10, 4]] + + def test_getitem_interval_with_nans(self, frame_or_series, indexer_sl): + # GH#41831 + + index = IntervalIndex([np.nan, np.nan]) + key = index[:-1] + + obj = frame_or_series(range(2), index=index) + if frame_or_series is DataFrame and indexer_sl is tm.setitem: + obj = obj.T + + result = indexer_sl(obj)[key] + expected = obj + + tm.assert_equal(result, expected) + + def test_setitem_interval_with_slice(self): + # GH#54722 + ii = IntervalIndex.from_breaks(range(4, 15)) + ser = Series(range(10), index=ii) + + orig = ser.copy() + + # This should be a no-op (used to raise) + ser.loc[1:3] = 20 + tm.assert_series_equal(ser, orig) + + ser.loc[6:8] = 19 + orig.iloc[1:4] = 19 + tm.assert_series_equal(ser, orig) + + ser2 = Series(range(5), index=ii[::2]) + orig2 = ser2.copy() + + # this used to raise + ser2.loc[6:8] = 22 # <- raises on main, sets on branch + orig2.iloc[1] = 22 + tm.assert_series_equal(ser2, orig2) + + ser2.loc[5:7] = 21 + orig2.iloc[:2] = 21 + tm.assert_series_equal(ser2, orig2) + + +class TestIntervalIndexInsideMultiIndex: + def test_mi_intervalindex_slicing_with_scalar(self): + # GH#27456 + ii = IntervalIndex.from_arrays( + [0, 1, 10, 11, 0, 1, 10, 11], [1, 2, 11, 12, 1, 2, 11, 12], name="MP" + ) + idx = pd.MultiIndex.from_arrays( + [ + pd.Index(["FC", "FC", "FC", "FC", "OWNER", "OWNER", "OWNER", "OWNER"]), + pd.Index( + ["RID1", "RID1", "RID2", "RID2", "RID1", "RID1", "RID2", "RID2"] + ), + ii, + ] + ) + + idx.names = ["Item", "RID", "MP"] + df = DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8]}) + df.index = idx + + query_df = DataFrame( + { + "Item": ["FC", "OWNER", "FC", "OWNER", "OWNER"], + "RID": ["RID1", "RID1", "RID1", "RID2", "RID2"], + "MP": [0.2, 1.5, 1.6, 11.1, 10.9], + } + ) + + query_df = query_df.sort_index() + + idx = pd.MultiIndex.from_arrays([query_df.Item, query_df.RID, query_df.MP]) + query_df.index = idx + result = df.value.loc[query_df.index] + + # the IntervalIndex level is indexed with floats, which map to + # the intervals containing them. Matching the behavior we would get + # with _only_ an IntervalIndex, we get an IntervalIndex level back. + sliced_level = ii.take([0, 1, 1, 3, 2]) + expected_index = pd.MultiIndex.from_arrays( + [idx.get_level_values(0), idx.get_level_values(1), sliced_level] + ) + expected = Series([1, 6, 2, 8, 7], index=expected_index, name="value") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "base", + [101, 1010], + ) + def test_reindex_behavior_with_interval_index(self, base): + # GH 51826 + + ser = Series( + range(base), + index=IntervalIndex.from_arrays(range(base), range(1, base + 1)), + ) + expected_result = Series([np.nan, 0], index=[np.nan, 1.0], dtype=float) + result = ser.reindex(index=[np.nan, 1.0]) + tm.assert_series_equal(result, expected_result) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/interval/test_interval_new.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/interval/test_interval_new.py new file mode 100644 index 0000000000000000000000000000000000000000..018db5846f4e269efe69de91b30b461822872410 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/interval/test_interval_new.py @@ -0,0 +1,229 @@ +import re + +import numpy as np +import pytest + +from pandas import ( + Index, + Interval, + IntervalIndex, + Series, +) +import pandas._testing as tm + + +class TestIntervalIndex: + @pytest.fixture + def series_with_interval_index(self): + return Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6))) + + def test_loc_with_interval(self, series_with_interval_index, indexer_sl): + # loc with single label / list of labels: + # - Intervals: only exact matches + # - scalars: those that contain it + + ser = series_with_interval_index.copy() + + expected = 0 + result = indexer_sl(ser)[Interval(0, 1)] + assert result == expected + + expected = ser.iloc[3:5] + result = indexer_sl(ser)[[Interval(3, 4), Interval(4, 5)]] + tm.assert_series_equal(expected, result) + + # missing or not exact + with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='left')")): + indexer_sl(ser)[Interval(3, 5, closed="left")] + + with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='right')")): + indexer_sl(ser)[Interval(3, 5)] + + with pytest.raises( + KeyError, match=re.escape("Interval(-2, 0, closed='right')") + ): + indexer_sl(ser)[Interval(-2, 0)] + + with pytest.raises(KeyError, match=re.escape("Interval(5, 6, closed='right')")): + indexer_sl(ser)[Interval(5, 6)] + + def test_loc_with_scalar(self, series_with_interval_index, indexer_sl): + # loc with single label / list of labels: + # - Intervals: only exact matches + # - scalars: those that contain it + + ser = series_with_interval_index.copy() + + assert indexer_sl(ser)[1] == 0 + assert indexer_sl(ser)[1.5] == 1 + assert indexer_sl(ser)[2] == 1 + + expected = ser.iloc[1:4] + tm.assert_series_equal(expected, indexer_sl(ser)[[1.5, 2.5, 3.5]]) + tm.assert_series_equal(expected, indexer_sl(ser)[[2, 3, 4]]) + tm.assert_series_equal(expected, indexer_sl(ser)[[1.5, 3, 4]]) + + expected = ser.iloc[[1, 1, 2, 1]] + tm.assert_series_equal(expected, indexer_sl(ser)[[1.5, 2, 2.5, 1.5]]) + + expected = ser.iloc[2:5] + tm.assert_series_equal(expected, indexer_sl(ser)[ser >= 2]) + + def test_loc_with_slices(self, series_with_interval_index, indexer_sl): + # loc with slices: + # - Interval objects: only works with exact matches + # - scalars: only works for non-overlapping, monotonic intervals, + # and start/stop select location based on the interval that + # contains them: + # (slice_loc(start, stop) == (idx.get_loc(start), idx.get_loc(stop)) + + ser = series_with_interval_index.copy() + + # slice of interval + + expected = ser.iloc[:3] + result = indexer_sl(ser)[Interval(0, 1) : Interval(2, 3)] + tm.assert_series_equal(expected, result) + + expected = ser.iloc[3:] + result = indexer_sl(ser)[Interval(3, 4) :] + tm.assert_series_equal(expected, result) + + msg = "Interval objects are not currently supported" + with pytest.raises(NotImplementedError, match=msg): + indexer_sl(ser)[Interval(3, 6) :] + + with pytest.raises(NotImplementedError, match=msg): + indexer_sl(ser)[Interval(3, 4, closed="left") :] + + def test_slice_step_ne1(self, series_with_interval_index): + # GH#31658 slice of scalar with step != 1 + ser = series_with_interval_index.copy() + expected = ser.iloc[0:4:2] + + result = ser[0:4:2] + tm.assert_series_equal(result, expected) + + result2 = ser[0:4][::2] + tm.assert_series_equal(result2, expected) + + def test_slice_float_start_stop(self, series_with_interval_index): + # GH#31658 slicing with integers is positional, with floats is not + # supported + ser = series_with_interval_index.copy() + + msg = "label-based slicing with step!=1 is not supported for IntervalIndex" + with pytest.raises(ValueError, match=msg): + ser[1.5:9.5:2] + + def test_slice_interval_step(self, series_with_interval_index): + # GH#31658 allows for integer step!=1, not Interval step + ser = series_with_interval_index.copy() + msg = "label-based slicing with step!=1 is not supported for IntervalIndex" + with pytest.raises(ValueError, match=msg): + ser[0 : 4 : Interval(0, 1)] + + def test_loc_with_overlap(self, indexer_sl): + idx = IntervalIndex.from_tuples([(1, 5), (3, 7)]) + ser = Series(range(len(idx)), index=idx) + + # scalar + expected = ser + result = indexer_sl(ser)[4] + tm.assert_series_equal(expected, result) + + result = indexer_sl(ser)[[4]] + tm.assert_series_equal(expected, result) + + # interval + expected = 0 + result = indexer_sl(ser)[Interval(1, 5)] + assert expected == result + + expected = ser + result = indexer_sl(ser)[[Interval(1, 5), Interval(3, 7)]] + tm.assert_series_equal(expected, result) + + with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='right')")): + indexer_sl(ser)[Interval(3, 5)] + + msg = ( + r"None of \[IntervalIndex\(\[\(3, 5\]\], " + r"dtype='interval\[int64, right\]'\)\] are in the \[index\]" + ) + with pytest.raises(KeyError, match=msg): + indexer_sl(ser)[[Interval(3, 5)]] + + # slices with interval (only exact matches) + expected = ser + result = indexer_sl(ser)[Interval(1, 5) : Interval(3, 7)] + tm.assert_series_equal(expected, result) + + msg = ( + "'can only get slices from an IntervalIndex if bounds are " + "non-overlapping and all monotonic increasing or decreasing'" + ) + with pytest.raises(KeyError, match=msg): + indexer_sl(ser)[Interval(1, 6) : Interval(3, 8)] + + if indexer_sl is tm.loc: + # slices with scalar raise for overlapping intervals + # TODO KeyError is the appropriate error? + with pytest.raises(KeyError, match=msg): + ser.loc[1:4] + + def test_non_unique(self, indexer_sl): + idx = IntervalIndex.from_tuples([(1, 3), (3, 7)]) + ser = Series(range(len(idx)), index=idx) + + result = indexer_sl(ser)[Interval(1, 3)] + assert result == 0 + + result = indexer_sl(ser)[[Interval(1, 3)]] + expected = ser.iloc[0:1] + tm.assert_series_equal(expected, result) + + def test_non_unique_moar(self, indexer_sl): + idx = IntervalIndex.from_tuples([(1, 3), (1, 3), (3, 7)]) + ser = Series(range(len(idx)), index=idx) + + expected = ser.iloc[[0, 1]] + result = indexer_sl(ser)[Interval(1, 3)] + tm.assert_series_equal(expected, result) + + expected = ser + result = indexer_sl(ser)[Interval(1, 3) :] + tm.assert_series_equal(expected, result) + + expected = ser.iloc[[0, 1]] + result = indexer_sl(ser)[[Interval(1, 3)]] + tm.assert_series_equal(expected, result) + + def test_loc_getitem_missing_key_error_message( + self, frame_or_series, series_with_interval_index + ): + # GH#27365 + ser = series_with_interval_index.copy() + obj = frame_or_series(ser) + with pytest.raises(KeyError, match=r"\[6\]"): + obj.loc[[4, 5, 6]] + + +@pytest.mark.parametrize( + "intervals", + [ + ([Interval(-np.inf, 0.0), Interval(0.0, 1.0)]), + ([Interval(-np.inf, -2.0), Interval(-2.0, -1.0)]), + ([Interval(-1.0, 0.0), Interval(0.0, np.inf)]), + ([Interval(1.0, 2.0), Interval(2.0, np.inf)]), + ], +) +def test_repeating_interval_index_with_infs(intervals): + # GH 46658 + + interval_index = Index(intervals * 51) + + expected = np.arange(1, 102, 2, dtype=np.intp) + result = interval_index.get_indexer_for([intervals[1]]) + + tm.assert_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_chaining_and_caching.py new file mode 100644 index 0000000000000000000000000000000000000000..0dd1a56890fee90e49646ff2a1fe87c6249b3f57 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_chaining_and_caching.py @@ -0,0 +1,87 @@ +import numpy as np +import pytest + +from pandas._libs import index as libindex +from pandas.errors import SettingWithCopyError +import pandas.util._test_decorators as td + +from pandas import ( + DataFrame, + MultiIndex, + Series, +) +import pandas._testing as tm + + +def test_detect_chained_assignment(using_copy_on_write, warn_copy_on_write): + # Inplace ops, originally from: + # https://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug + a = [12, 23] + b = [123, None] + c = [1234, 2345] + d = [12345, 23456] + tuples = [("eyes", "left"), ("eyes", "right"), ("ears", "left"), ("ears", "right")] + events = { + ("eyes", "left"): a, + ("eyes", "right"): b, + ("ears", "left"): c, + ("ears", "right"): d, + } + multiind = MultiIndex.from_tuples(tuples, names=["part", "side"]) + zed = DataFrame(events, index=["a", "b"], columns=multiind) + + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + zed["eyes"]["right"].fillna(value=555, inplace=True) + elif warn_copy_on_write: + with tm.assert_produces_warning(None): + zed["eyes"]["right"].fillna(value=555, inplace=True) + else: + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(SettingWithCopyError, match=msg): + with tm.assert_produces_warning(None): + zed["eyes"]["right"].fillna(value=555, inplace=True) + + +@td.skip_array_manager_invalid_test # with ArrayManager df.loc[0] is not a view +def test_cache_updating(using_copy_on_write, warn_copy_on_write): + # 5216 + # make sure that we don't try to set a dead cache + a = np.random.default_rng(2).random((10, 3)) + df = DataFrame(a, columns=["x", "y", "z"]) + df_original = df.copy() + tuples = [(i, j) for i in range(5) for j in range(2)] + index = MultiIndex.from_tuples(tuples) + df.index = index + + # setting via chained assignment + # but actually works, since everything is a view + + with tm.raises_chained_assignment_error(): + df.loc[0]["z"].iloc[0] = 1.0 + + if using_copy_on_write: + assert df.loc[(0, 0), "z"] == df_original.loc[0, "z"] + else: + result = df.loc[(0, 0), "z"] + assert result == 1 + + # correct setting + df.loc[(0, 0), "z"] = 2 + result = df.loc[(0, 0), "z"] + assert result == 2 + + +def test_indexer_caching(monkeypatch): + # GH5727 + # make sure that indexers are in the _internal_names_set + size_cutoff = 20 + with monkeypatch.context(): + monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff) + index = MultiIndex.from_arrays([np.arange(size_cutoff), np.arange(size_cutoff)]) + s = Series(np.zeros(size_cutoff), index=index) + + # setitem + s[s == 0] = 1 + expected = Series(np.ones(size_cutoff), index=index) + tm.assert_series_equal(s, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_datetime.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_datetime.py new file mode 100644 index 0000000000000000000000000000000000000000..d325971e7baf69fb3119afc018c6f90da93e0d3b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_datetime.py @@ -0,0 +1,50 @@ +from datetime import datetime + +import numpy as np + +from pandas import ( + DataFrame, + Index, + MultiIndex, + Period, + Series, + period_range, + to_datetime, +) +import pandas._testing as tm + + +def test_multiindex_period_datetime(): + # GH4861, using datetime in period of multiindex raises exception + + idx1 = Index(["a", "a", "a", "b", "b"]) + idx2 = period_range("2012-01", periods=len(idx1), freq="M") + s = Series(np.random.default_rng(2).standard_normal(len(idx1)), [idx1, idx2]) + + # try Period as index + expected = s.iloc[0] + result = s.loc["a", Period("2012-01")] + assert result == expected + + # try datetime as index + result = s.loc["a", datetime(2012, 1, 1)] + assert result == expected + + +def test_multiindex_datetime_columns(): + # GH35015, using datetime as column indices raises exception + + mi = MultiIndex.from_tuples( + [(to_datetime("02/29/2020"), to_datetime("03/01/2020"))], names=["a", "b"] + ) + + df = DataFrame([], columns=mi) + + expected_df = DataFrame( + [], + columns=MultiIndex.from_arrays( + [[to_datetime("02/29/2020")], [to_datetime("03/01/2020")]], names=["a", "b"] + ), + ) + + tm.assert_frame_equal(df, expected_df) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_getitem.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_getitem.py new file mode 100644 index 0000000000000000000000000000000000000000..b86e233110e882d3c9a71720bfc0b725bfd46923 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_getitem.py @@ -0,0 +1,410 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, +) +import pandas._testing as tm +from pandas.core.indexing import IndexingError + +# ---------------------------------------------------------------------------- +# test indexing of Series with multi-level Index +# ---------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "access_method", + [lambda s, x: s[:, x], lambda s, x: s.loc[:, x], lambda s, x: s.xs(x, level=1)], +) +@pytest.mark.parametrize( + "level1_value, expected", + [(0, Series([1], index=[0])), (1, Series([2, 3], index=[1, 2]))], +) +def test_series_getitem_multiindex(access_method, level1_value, expected): + # GH 6018 + # series regression getitem with a multi-index + + mi = MultiIndex.from_tuples([(0, 0), (1, 1), (2, 1)], names=["A", "B"]) + ser = Series([1, 2, 3], index=mi) + expected.index.name = "A" + + result = access_method(ser, level1_value) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("level0_value", ["D", "A"]) +def test_series_getitem_duplicates_multiindex(level0_value): + # GH 5725 the 'A' happens to be a valid Timestamp so the doesn't raise + # the appropriate error, only in PY3 of course! + + index = MultiIndex( + levels=[[level0_value, "B", "C"], [0, 26, 27, 37, 57, 67, 75, 82]], + codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], + names=["tag", "day"], + ) + arr = np.random.default_rng(2).standard_normal((len(index), 1)) + df = DataFrame(arr, index=index, columns=["val"]) + + # confirm indexing on missing value raises KeyError + if level0_value != "A": + with pytest.raises(KeyError, match=r"^'A'$"): + df.val["A"] + + with pytest.raises(KeyError, match=r"^'X'$"): + df.val["X"] + + result = df.val[level0_value] + expected = Series( + arr.ravel()[0:3], name="val", index=Index([26, 37, 57], name="day") + ) + tm.assert_series_equal(result, expected) + + +def test_series_getitem(multiindex_year_month_day_dataframe_random_data, indexer_sl): + s = multiindex_year_month_day_dataframe_random_data["A"] + expected = s.reindex(s.index[42:65]) + expected.index = expected.index.droplevel(0).droplevel(0) + + result = indexer_sl(s)[2000, 3] + tm.assert_series_equal(result, expected) + + +def test_series_getitem_returns_scalar( + multiindex_year_month_day_dataframe_random_data, indexer_sl +): + s = multiindex_year_month_day_dataframe_random_data["A"] + expected = s.iloc[49] + + result = indexer_sl(s)[2000, 3, 10] + assert result == expected + + +@pytest.mark.parametrize( + "indexer,expected_error,expected_error_msg", + [ + (lambda s: s.__getitem__((2000, 3, 4)), KeyError, r"^\(2000, 3, 4\)$"), + (lambda s: s[(2000, 3, 4)], KeyError, r"^\(2000, 3, 4\)$"), + (lambda s: s.loc[(2000, 3, 4)], KeyError, r"^\(2000, 3, 4\)$"), + (lambda s: s.loc[(2000, 3, 4, 5)], IndexingError, "Too many indexers"), + (lambda s: s.__getitem__(len(s)), KeyError, ""), # match should include len(s) + (lambda s: s[len(s)], KeyError, ""), # match should include len(s) + ( + lambda s: s.iloc[len(s)], + IndexError, + "single positional indexer is out-of-bounds", + ), + ], +) +def test_series_getitem_indexing_errors( + multiindex_year_month_day_dataframe_random_data, + indexer, + expected_error, + expected_error_msg, +): + s = multiindex_year_month_day_dataframe_random_data["A"] + with pytest.raises(expected_error, match=expected_error_msg): + indexer(s) + + +def test_series_getitem_corner_generator( + multiindex_year_month_day_dataframe_random_data, +): + s = multiindex_year_month_day_dataframe_random_data["A"] + result = s[(x > 0 for x in s)] + expected = s[s > 0] + tm.assert_series_equal(result, expected) + + +# ---------------------------------------------------------------------------- +# test indexing of DataFrame with multi-level Index +# ---------------------------------------------------------------------------- + + +def test_getitem_simple(multiindex_dataframe_random_data): + df = multiindex_dataframe_random_data.T + expected = df.values[:, 0] + result = df["foo", "one"].values + tm.assert_almost_equal(result, expected) + + +@pytest.mark.parametrize( + "indexer,expected_error_msg", + [ + (lambda df: df[("foo", "four")], r"^\('foo', 'four'\)$"), + (lambda df: df["foobar"], r"^'foobar'$"), + ], +) +def test_frame_getitem_simple_key_error( + multiindex_dataframe_random_data, indexer, expected_error_msg +): + df = multiindex_dataframe_random_data.T + with pytest.raises(KeyError, match=expected_error_msg): + indexer(df) + + +def test_tuple_string_column_names(): + # GH#50372 + mi = MultiIndex.from_tuples([("a", "aa"), ("a", "ab"), ("b", "ba"), ("b", "bb")]) + df = DataFrame([range(4), range(1, 5), range(2, 6)], columns=mi) + df["single_index"] = 0 + + df_flat = df.copy() + df_flat.columns = df_flat.columns.to_flat_index() + df_flat["new_single_index"] = 0 + + result = df_flat[[("a", "aa"), "new_single_index"]] + expected = DataFrame( + [[0, 0], [1, 0], [2, 0]], columns=Index([("a", "aa"), "new_single_index"]) + ) + tm.assert_frame_equal(result, expected) + + +def test_frame_getitem_multicolumn_empty_level(): + df = DataFrame({"a": ["1", "2", "3"], "b": ["2", "3", "4"]}) + df.columns = [ + ["level1 item1", "level1 item2"], + ["", "level2 item2"], + ["level3 item1", "level3 item2"], + ] + + result = df["level1 item1"] + expected = DataFrame( + [["1"], ["2"], ["3"]], index=df.index, columns=["level3 item1"] + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "indexer,expected_slice", + [ + (lambda df: df["foo"], slice(3)), + (lambda df: df["bar"], slice(3, 5)), + (lambda df: df.loc[:, "bar"], slice(3, 5)), + ], +) +def test_frame_getitem_toplevel( + multiindex_dataframe_random_data, indexer, expected_slice +): + df = multiindex_dataframe_random_data.T + expected = df.reindex(columns=df.columns[expected_slice]) + expected.columns = expected.columns.droplevel(0) + result = indexer(df) + tm.assert_frame_equal(result, expected) + + +def test_frame_mixed_depth_get(): + arrays = [ + ["a", "top", "top", "routine1", "routine1", "routine2"], + ["", "OD", "OD", "result1", "result2", "result1"], + ["", "wx", "wy", "", "", ""], + ] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + df = DataFrame(np.random.default_rng(2).standard_normal((4, 6)), columns=index) + + result = df["a"] + expected = df["a", "", ""].rename("a") + tm.assert_series_equal(result, expected) + + result = df["routine1", "result1"] + expected = df["routine1", "result1", ""] + expected = expected.rename(("routine1", "result1")) + tm.assert_series_equal(result, expected) + + +def test_frame_getitem_nan_multiindex(nulls_fixture): + # GH#29751 + # loc on a multiindex containing nan values + n = nulls_fixture # for code readability + cols = ["a", "b", "c"] + df = DataFrame( + [[11, n, 13], [21, n, 23], [31, n, 33], [41, n, 43]], + columns=cols, + ).set_index(["a", "b"]) + df["c"] = df["c"].astype("int64") + + idx = (21, n) + result = df.loc[:idx] + expected = DataFrame([[11, n, 13], [21, n, 23]], columns=cols).set_index(["a", "b"]) + expected["c"] = expected["c"].astype("int64") + tm.assert_frame_equal(result, expected) + + result = df.loc[idx:] + expected = DataFrame( + [[21, n, 23], [31, n, 33], [41, n, 43]], columns=cols + ).set_index(["a", "b"]) + expected["c"] = expected["c"].astype("int64") + tm.assert_frame_equal(result, expected) + + idx1, idx2 = (21, n), (31, n) + result = df.loc[idx1:idx2] + expected = DataFrame([[21, n, 23], [31, n, 33]], columns=cols).set_index(["a", "b"]) + expected["c"] = expected["c"].astype("int64") + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "indexer,expected", + [ + ( + (["b"], ["bar", np.nan]), + ( + DataFrame( + [[2, 3], [5, 6]], + columns=MultiIndex.from_tuples([("b", "bar"), ("b", np.nan)]), + dtype="int64", + ) + ), + ), + ( + (["a", "b"]), + ( + DataFrame( + [[1, 2, 3], [4, 5, 6]], + columns=MultiIndex.from_tuples( + [("a", "foo"), ("b", "bar"), ("b", np.nan)] + ), + dtype="int64", + ) + ), + ), + ( + (["b"]), + ( + DataFrame( + [[2, 3], [5, 6]], + columns=MultiIndex.from_tuples([("b", "bar"), ("b", np.nan)]), + dtype="int64", + ) + ), + ), + ( + (["b"], ["bar"]), + ( + DataFrame( + [[2], [5]], + columns=MultiIndex.from_tuples([("b", "bar")]), + dtype="int64", + ) + ), + ), + ( + (["b"], [np.nan]), + ( + DataFrame( + [[3], [6]], + columns=MultiIndex( + codes=[[1], [-1]], levels=[["a", "b"], ["bar", "foo"]] + ), + dtype="int64", + ) + ), + ), + (("b", np.nan), Series([3, 6], dtype="int64", name=("b", np.nan))), + ], +) +def test_frame_getitem_nan_cols_multiindex( + indexer, + expected, + nulls_fixture, +): + # Slicing MultiIndex including levels with nan values, for more information + # see GH#25154 + df = DataFrame( + [[1, 2, 3], [4, 5, 6]], + columns=MultiIndex.from_tuples( + [("a", "foo"), ("b", "bar"), ("b", nulls_fixture)] + ), + dtype="int64", + ) + + result = df.loc[:, indexer] + tm.assert_equal(result, expected) + + +# ---------------------------------------------------------------------------- +# test indexing of DataFrame with multi-level Index with duplicates +# ---------------------------------------------------------------------------- + + +@pytest.fixture +def dataframe_with_duplicate_index(): + """Fixture for DataFrame used in tests for gh-4145 and gh-4146""" + data = [["a", "d", "e", "c", "f", "b"], [1, 4, 5, 3, 6, 2], [1, 4, 5, 3, 6, 2]] + index = ["h1", "h3", "h5"] + columns = MultiIndex( + levels=[["A", "B"], ["A1", "A2", "B1", "B2"]], + codes=[[0, 0, 0, 1, 1, 1], [0, 3, 3, 0, 1, 2]], + names=["main", "sub"], + ) + return DataFrame(data, index=index, columns=columns) + + +@pytest.mark.parametrize( + "indexer", [lambda df: df[("A", "A1")], lambda df: df.loc[:, ("A", "A1")]] +) +def test_frame_mi_access(dataframe_with_duplicate_index, indexer): + # GH 4145 + df = dataframe_with_duplicate_index + index = Index(["h1", "h3", "h5"]) + columns = MultiIndex.from_tuples([("A", "A1")], names=["main", "sub"]) + expected = DataFrame([["a", 1, 1]], index=columns, columns=index).T + + result = indexer(df) + tm.assert_frame_equal(result, expected) + + +def test_frame_mi_access_returns_series(dataframe_with_duplicate_index): + # GH 4146, not returning a block manager when selecting a unique index + # from a duplicate index + # as of 4879, this returns a Series (which is similar to what happens + # with a non-unique) + df = dataframe_with_duplicate_index + expected = Series(["a", 1, 1], index=["h1", "h3", "h5"], name="A1") + result = df["A"]["A1"] + tm.assert_series_equal(result, expected) + + +def test_frame_mi_access_returns_frame(dataframe_with_duplicate_index): + # selecting a non_unique from the 2nd level + df = dataframe_with_duplicate_index + expected = DataFrame( + [["d", 4, 4], ["e", 5, 5]], + index=Index(["B2", "B2"], name="sub"), + columns=["h1", "h3", "h5"], + ).T + result = df["A"]["B2"] + tm.assert_frame_equal(result, expected) + + +def test_frame_mi_empty_slice(): + # GH 15454 + df = DataFrame(0, index=range(2), columns=MultiIndex.from_product([[1], [2]])) + result = df[[]] + expected = DataFrame( + index=[0, 1], columns=MultiIndex(levels=[[1], [2]], codes=[[], []]) + ) + tm.assert_frame_equal(result, expected) + + +def test_loc_empty_multiindex(): + # GH#36936 + arrays = [["a", "a", "b", "a"], ["a", "a", "b", "b"]] + index = MultiIndex.from_arrays(arrays, names=("idx1", "idx2")) + df = DataFrame([1, 2, 3, 4], index=index, columns=["value"]) + + # loc on empty multiindex == loc with False mask + empty_multiindex = df.loc[df.loc[:, "value"] == 0, :].index + result = df.loc[empty_multiindex, :] + expected = df.loc[[False] * len(df.index), :] + tm.assert_frame_equal(result, expected) + + # replacing value with loc on empty multiindex + df.loc[df.loc[df.loc[:, "value"] == 0].index, "value"] = 5 + result = df + expected = DataFrame([1, 2, 3, 4], index=index, columns=["value"]) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_indexing_slow.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_indexing_slow.py new file mode 100644 index 0000000000000000000000000000000000000000..c6fc1659500e62423f20cca44b40762bee60509d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_indexing_slow.py @@ -0,0 +1,118 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm + + +@pytest.fixture +def m(): + return 5 + + +@pytest.fixture +def n(): + return 100 + + +@pytest.fixture +def cols(): + return ["jim", "joe", "jolie", "joline", "jolia"] + + +@pytest.fixture +def vals(n): + vals = [ + np.random.default_rng(2).integers(0, 10, n), + np.random.default_rng(2).choice(list("abcdefghij"), n), + np.random.default_rng(2).choice( + pd.date_range("20141009", periods=10).tolist(), n + ), + np.random.default_rng(2).choice(list("ZYXWVUTSRQ"), n), + np.random.default_rng(2).standard_normal(n), + ] + vals = list(map(tuple, zip(*vals))) + return vals + + +@pytest.fixture +def keys(n, m, vals): + # bunch of keys for testing + keys = [ + np.random.default_rng(2).integers(0, 11, m), + np.random.default_rng(2).choice(list("abcdefghijk"), m), + np.random.default_rng(2).choice( + pd.date_range("20141009", periods=11).tolist(), m + ), + np.random.default_rng(2).choice(list("ZYXWVUTSRQP"), m), + ] + keys = list(map(tuple, zip(*keys))) + keys += [t[:-1] for t in vals[:: n // m]] + return keys + + +# covers both unique index and non-unique index +@pytest.fixture +def df(vals, cols): + return DataFrame(vals, columns=cols) + + +@pytest.fixture +def a(df): + return pd.concat([df, df]) + + +@pytest.fixture +def b(df, cols): + return df.drop_duplicates(subset=cols[:-1]) + + +@pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning") +@pytest.mark.parametrize("lexsort_depth", list(range(5))) +@pytest.mark.parametrize("frame_fixture", ["a", "b"]) +def test_multiindex_get_loc(request, lexsort_depth, keys, frame_fixture, cols): + # GH7724, GH2646 + + frame = request.getfixturevalue(frame_fixture) + if lexsort_depth == 0: + df = frame.copy(deep=False) + else: + df = frame.sort_values(by=cols[:lexsort_depth]) + + mi = df.set_index(cols[:-1]) + assert not mi.index._lexsort_depth < lexsort_depth + for key in keys: + mask = np.ones(len(df), dtype=bool) + + # test for all partials of this key + for i, k in enumerate(key): + mask &= df.iloc[:, i] == k + + if not mask.any(): + assert key[: i + 1] not in mi.index + continue + + assert key[: i + 1] in mi.index + right = df[mask].copy(deep=False) + + if i + 1 != len(key): # partial key + return_value = right.drop(cols[: i + 1], axis=1, inplace=True) + assert return_value is None + return_value = right.set_index(cols[i + 1 : -1], inplace=True) + assert return_value is None + tm.assert_frame_equal(mi.loc[key[: i + 1]], right) + + else: # full key + return_value = right.set_index(cols[:-1], inplace=True) + assert return_value is None + if len(right) == 1: # single hit + right = Series( + right["jolia"].values, name=right.index[0], index=["jolia"] + ) + tm.assert_series_equal(mi.loc[key[: i + 1]], right) + else: # multi hit + tm.assert_frame_equal(mi.loc[key[: i + 1]], right) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_loc.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_loc.py new file mode 100644 index 0000000000000000000000000000000000000000..5508153322adba90ac2739d30161f8b824b54eef --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_loc.py @@ -0,0 +1,992 @@ +import numpy as np +import pytest + +from pandas.errors import ( + IndexingError, + PerformanceWarning, +) + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, +) +import pandas._testing as tm + + +@pytest.fixture +def single_level_multiindex(): + """single level MultiIndex""" + return MultiIndex( + levels=[["foo", "bar", "baz", "qux"]], codes=[[0, 1, 2, 3]], names=["first"] + ) + + +@pytest.fixture +def frame_random_data_integer_multi_index(): + levels = [[0, 1], [0, 1, 2]] + codes = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] + index = MultiIndex(levels=levels, codes=codes) + return DataFrame(np.random.default_rng(2).standard_normal((6, 2)), index=index) + + +class TestMultiIndexLoc: + def test_loc_setitem_frame_with_multiindex(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + frame.loc[("bar", "two"), "B"] = 5 + assert frame.loc[("bar", "two"), "B"] == 5 + + # with integer labels + df = frame.copy() + df.columns = list(range(3)) + df.loc[("bar", "two"), 1] = 7 + assert df.loc[("bar", "two"), 1] == 7 + + def test_loc_getitem_general(self, any_real_numpy_dtype): + # GH#2817 + dtype = any_real_numpy_dtype + data = { + "amount": {0: 700, 1: 600, 2: 222, 3: 333, 4: 444}, + "col": {0: 3.5, 1: 3.5, 2: 4.0, 3: 4.0, 4: 4.0}, + "num": {0: 12, 1: 11, 2: 12, 3: 12, 4: 12}, + } + df = DataFrame(data) + df = df.astype({"col": dtype, "num": dtype}) + df = df.set_index(keys=["col", "num"]) + key = 4.0, 12 + + # emits a PerformanceWarning, ok + with tm.assert_produces_warning(PerformanceWarning): + tm.assert_frame_equal(df.loc[key], df.iloc[2:]) + + # this is ok + return_value = df.sort_index(inplace=True) + assert return_value is None + res = df.loc[key] + + # col has float dtype, result should be float64 Index + col_arr = np.array([4.0] * 3, dtype=dtype) + year_arr = np.array([12] * 3, dtype=dtype) + index = MultiIndex.from_arrays([col_arr, year_arr], names=["col", "num"]) + expected = DataFrame({"amount": [222, 333, 444]}, index=index) + tm.assert_frame_equal(res, expected) + + def test_loc_getitem_multiindex_missing_label_raises(self): + # GH#21593 + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 3)), + columns=[[2, 2, 4], [6, 8, 10]], + index=[[4, 4, 8], [8, 10, 12]], + ) + + with pytest.raises(KeyError, match=r"^2$"): + df.loc[2] + + def test_loc_getitem_list_of_tuples_with_multiindex( + self, multiindex_year_month_day_dataframe_random_data + ): + ser = multiindex_year_month_day_dataframe_random_data["A"] + expected = ser.reindex(ser.index[49:51]) + result = ser.loc[[(2000, 3, 10), (2000, 3, 13)]] + tm.assert_series_equal(result, expected) + + def test_loc_getitem_series(self): + # GH14730 + # passing a series as a key with a MultiIndex + index = MultiIndex.from_product([[1, 2, 3], ["A", "B", "C"]]) + x = Series(index=index, data=range(9), dtype=np.float64) + y = Series([1, 3]) + expected = Series( + data=[0, 1, 2, 6, 7, 8], + index=MultiIndex.from_product([[1, 3], ["A", "B", "C"]]), + dtype=np.float64, + ) + result = x.loc[y] + tm.assert_series_equal(result, expected) + + result = x.loc[[1, 3]] + tm.assert_series_equal(result, expected) + + # GH15424 + y1 = Series([1, 3], index=[1, 2]) + result = x.loc[y1] + tm.assert_series_equal(result, expected) + + empty = Series(data=[], dtype=np.float64) + expected = Series( + [], + index=MultiIndex(levels=index.levels, codes=[[], []], dtype=np.float64), + dtype=np.float64, + ) + result = x.loc[empty] + tm.assert_series_equal(result, expected) + + def test_loc_getitem_array(self): + # GH15434 + # passing an array as a key with a MultiIndex + index = MultiIndex.from_product([[1, 2, 3], ["A", "B", "C"]]) + x = Series(index=index, data=range(9), dtype=np.float64) + y = np.array([1, 3]) + expected = Series( + data=[0, 1, 2, 6, 7, 8], + index=MultiIndex.from_product([[1, 3], ["A", "B", "C"]]), + dtype=np.float64, + ) + result = x.loc[y] + tm.assert_series_equal(result, expected) + + # empty array: + empty = np.array([]) + expected = Series( + [], + index=MultiIndex(levels=index.levels, codes=[[], []], dtype=np.float64), + dtype="float64", + ) + result = x.loc[empty] + tm.assert_series_equal(result, expected) + + # 0-dim array (scalar): + scalar = np.int64(1) + expected = Series(data=[0, 1, 2], index=["A", "B", "C"], dtype=np.float64) + result = x.loc[scalar] + tm.assert_series_equal(result, expected) + + def test_loc_multiindex_labels(self): + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 3)), + columns=[["i", "i", "j"], ["A", "A", "B"]], + index=[["i", "i", "j"], ["X", "X", "Y"]], + ) + + # the first 2 rows + expected = df.iloc[[0, 1]].droplevel(0) + result = df.loc["i"] + tm.assert_frame_equal(result, expected) + + # 2nd (last) column + expected = df.iloc[:, [2]].droplevel(0, axis=1) + result = df.loc[:, "j"] + tm.assert_frame_equal(result, expected) + + # bottom right corner + expected = df.iloc[[2], [2]].droplevel(0).droplevel(0, axis=1) + result = df.loc["j"].loc[:, "j"] + tm.assert_frame_equal(result, expected) + + # with a tuple + expected = df.iloc[[0, 1]] + result = df.loc[("i", "X")] + tm.assert_frame_equal(result, expected) + + def test_loc_multiindex_ints(self): + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 3)), + columns=[[2, 2, 4], [6, 8, 10]], + index=[[4, 4, 8], [8, 10, 12]], + ) + expected = df.iloc[[0, 1]].droplevel(0) + result = df.loc[4] + tm.assert_frame_equal(result, expected) + + def test_loc_multiindex_missing_label_raises(self): + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 3)), + columns=[[2, 2, 4], [6, 8, 10]], + index=[[4, 4, 8], [8, 10, 12]], + ) + + with pytest.raises(KeyError, match=r"^2$"): + df.loc[2] + + @pytest.mark.parametrize("key, pos", [([2, 4], [0, 1]), ([2], []), ([2, 3], [])]) + def test_loc_multiindex_list_missing_label(self, key, pos): + # GH 27148 - lists with missing labels _do_ raise + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 3)), + columns=[[2, 2, 4], [6, 8, 10]], + index=[[4, 4, 8], [8, 10, 12]], + ) + + with pytest.raises(KeyError, match="not in index"): + df.loc[key] + + def test_loc_multiindex_too_many_dims_raises(self): + # GH 14885 + s = Series( + range(8), + index=MultiIndex.from_product([["a", "b"], ["c", "d"], ["e", "f"]]), + ) + + with pytest.raises(KeyError, match=r"^\('a', 'b'\)$"): + s.loc["a", "b"] + with pytest.raises(KeyError, match=r"^\('a', 'd', 'g'\)$"): + s.loc["a", "d", "g"] + with pytest.raises(IndexingError, match="Too many indexers"): + s.loc["a", "d", "g", "j"] + + def test_loc_multiindex_indexer_none(self): + # GH6788 + # multi-index indexer is None (meaning take all) + attributes = ["Attribute" + str(i) for i in range(1)] + attribute_values = ["Value" + str(i) for i in range(5)] + + index = MultiIndex.from_product([attributes, attribute_values]) + df = 0.1 * np.random.default_rng(2).standard_normal((10, 1 * 5)) + 0.5 + df = DataFrame(df, columns=index) + result = df[attributes] + tm.assert_frame_equal(result, df) + + # GH 7349 + # loc with a multi-index seems to be doing fallback + df = DataFrame( + np.arange(12).reshape(-1, 1), + index=MultiIndex.from_product([[1, 2, 3, 4], [1, 2, 3]]), + ) + + expected = df.loc[([1, 2],), :] + result = df.loc[[1, 2]] + tm.assert_frame_equal(result, expected) + + def test_loc_multiindex_incomplete(self): + # GH 7399 + # incomplete indexers + s = Series( + np.arange(15, dtype="int64"), + MultiIndex.from_product([range(5), ["a", "b", "c"]]), + ) + expected = s.loc[:, "a":"c"] + + result = s.loc[0:4, "a":"c"] + tm.assert_series_equal(result, expected) + + result = s.loc[:4, "a":"c"] + tm.assert_series_equal(result, expected) + + result = s.loc[0:, "a":"c"] + tm.assert_series_equal(result, expected) + + # GH 7400 + # multiindexer getitem with list of indexers skips wrong element + s = Series( + np.arange(15, dtype="int64"), + MultiIndex.from_product([range(5), ["a", "b", "c"]]), + ) + expected = s.iloc[[6, 7, 8, 12, 13, 14]] + result = s.loc[2:4:2, "a":"c"] + tm.assert_series_equal(result, expected) + + def test_get_loc_single_level(self, single_level_multiindex): + single_level = single_level_multiindex + s = Series( + np.random.default_rng(2).standard_normal(len(single_level)), + index=single_level, + ) + for k in single_level.values: + s[k] + + def test_loc_getitem_int_slice(self): + # GH 3053 + # loc should treat integer slices like label slices + + index = MultiIndex.from_product([[6, 7, 8], ["a", "b"]]) + df = DataFrame(np.random.default_rng(2).standard_normal((6, 6)), index, index) + result = df.loc[6:8, :] + expected = df + tm.assert_frame_equal(result, expected) + + index = MultiIndex.from_product([[10, 20, 30], ["a", "b"]]) + df = DataFrame(np.random.default_rng(2).standard_normal((6, 6)), index, index) + result = df.loc[20:30, :] + expected = df.iloc[2:] + tm.assert_frame_equal(result, expected) + + # doc examples + result = df.loc[10, :] + expected = df.iloc[0:2] + expected.index = ["a", "b"] + tm.assert_frame_equal(result, expected) + + result = df.loc[:, 10] + expected = df[10] + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "indexer_type_1", (list, tuple, set, slice, np.ndarray, Series, Index) + ) + @pytest.mark.parametrize( + "indexer_type_2", (list, tuple, set, slice, np.ndarray, Series, Index) + ) + def test_loc_getitem_nested_indexer(self, indexer_type_1, indexer_type_2): + # GH #19686 + # .loc should work with nested indexers which can be + # any list-like objects (see `is_list_like` (`pandas.api.types`)) or slices + + def convert_nested_indexer(indexer_type, keys): + if indexer_type == np.ndarray: + return np.array(keys) + if indexer_type == slice: + return slice(*keys) + return indexer_type(keys) + + a = [10, 20, 30] + b = [1, 2, 3] + index = MultiIndex.from_product([a, b]) + df = DataFrame( + np.arange(len(index), dtype="int64"), index=index, columns=["Data"] + ) + + keys = ([10, 20], [2, 3]) + types = (indexer_type_1, indexer_type_2) + + # check indexers with all the combinations of nested objects + # of all the valid types + indexer = tuple( + convert_nested_indexer(indexer_type, k) + for indexer_type, k in zip(types, keys) + ) + if indexer_type_1 is set or indexer_type_2 is set: + with pytest.raises(TypeError, match="as an indexer is not supported"): + df.loc[indexer, "Data"] + + return + else: + result = df.loc[indexer, "Data"] + expected = Series( + [1, 2, 4, 5], name="Data", index=MultiIndex.from_product(keys) + ) + + tm.assert_series_equal(result, expected) + + def test_multiindex_loc_one_dimensional_tuple(self, frame_or_series): + # GH#37711 + mi = MultiIndex.from_tuples([("a", "A"), ("b", "A")]) + obj = frame_or_series([1, 2], index=mi) + obj.loc[("a",)] = 0 + expected = frame_or_series([0, 2], index=mi) + tm.assert_equal(obj, expected) + + @pytest.mark.parametrize("indexer", [("a",), ("a")]) + def test_multiindex_one_dimensional_tuple_columns(self, indexer): + # GH#37711 + mi = MultiIndex.from_tuples([("a", "A"), ("b", "A")]) + obj = DataFrame([1, 2], index=mi) + obj.loc[indexer, :] = 0 + expected = DataFrame([0, 2], index=mi) + tm.assert_frame_equal(obj, expected) + + @pytest.mark.parametrize( + "indexer, exp_value", [(slice(None), 1.0), ((1, 2), np.nan)] + ) + def test_multiindex_setitem_columns_enlarging(self, indexer, exp_value): + # GH#39147 + mi = MultiIndex.from_tuples([(1, 2), (3, 4)]) + df = DataFrame([[1, 2], [3, 4]], index=mi, columns=["a", "b"]) + df.loc[indexer, ["c", "d"]] = 1.0 + expected = DataFrame( + [[1, 2, 1.0, 1.0], [3, 4, exp_value, exp_value]], + index=mi, + columns=["a", "b", "c", "d"], + ) + tm.assert_frame_equal(df, expected) + + def test_sorted_multiindex_after_union(self): + # GH#44752 + midx = MultiIndex.from_product( + [pd.date_range("20110101", periods=2), Index(["a", "b"])] + ) + ser1 = Series(1, index=midx) + ser2 = Series(1, index=midx[:2]) + df = pd.concat([ser1, ser2], axis=1) + expected = df.copy() + result = df.loc["2011-01-01":"2011-01-02"] + tm.assert_frame_equal(result, expected) + + df = DataFrame({0: ser1, 1: ser2}) + result = df.loc["2011-01-01":"2011-01-02"] + tm.assert_frame_equal(result, expected) + + df = pd.concat([ser1, ser2.reindex(ser1.index)], axis=1) + result = df.loc["2011-01-01":"2011-01-02"] + tm.assert_frame_equal(result, expected) + + def test_loc_no_second_level_index(self): + # GH#43599 + df = DataFrame( + index=MultiIndex.from_product([list("ab"), list("cd"), list("e")]), + columns=["Val"], + ) + res = df.loc[np.s_[:, "c", :]] + expected = DataFrame( + index=MultiIndex.from_product([list("ab"), list("e")]), columns=["Val"] + ) + tm.assert_frame_equal(res, expected) + + def test_loc_multi_index_key_error(self): + # GH 51892 + df = DataFrame( + { + (1, 2): ["a", "b", "c"], + (1, 3): ["d", "e", "f"], + (2, 2): ["g", "h", "i"], + (2, 4): ["j", "k", "l"], + } + ) + with pytest.raises(KeyError, match=r"(1, 4)"): + df.loc[0, (1, 4)] + + +@pytest.mark.parametrize( + "indexer, pos", + [ + ([], []), # empty ok + (["A"], slice(3)), + (["A", "D"], []), # "D" isn't present -> raise + (["D", "E"], []), # no values found -> raise + (["D"], []), # same, with single item list: GH 27148 + (pd.IndexSlice[:, ["foo"]], slice(2, None, 3)), + (pd.IndexSlice[:, ["foo", "bah"]], slice(2, None, 3)), + ], +) +def test_loc_getitem_duplicates_multiindex_missing_indexers(indexer, pos): + # GH 7866 + # multi-index slicing with missing indexers + idx = MultiIndex.from_product( + [["A", "B", "C"], ["foo", "bar", "baz"]], names=["one", "two"] + ) + ser = Series(np.arange(9, dtype="int64"), index=idx).sort_index() + expected = ser.iloc[pos] + + if expected.size == 0 and indexer != []: + with pytest.raises(KeyError, match=str(indexer)): + ser.loc[indexer] + elif indexer == (slice(None), ["foo", "bah"]): + # "bah" is not in idx.levels[1], raising KeyError enforced in 2.0 + with pytest.raises(KeyError, match="'bah'"): + ser.loc[indexer] + else: + result = ser.loc[indexer] + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("columns_indexer", [([], slice(None)), (["foo"], [])]) +def test_loc_getitem_duplicates_multiindex_empty_indexer(columns_indexer): + # GH 8737 + # empty indexer + multi_index = MultiIndex.from_product((["foo", "bar", "baz"], ["alpha", "beta"])) + df = DataFrame( + np.random.default_rng(2).standard_normal((5, 6)), + index=range(5), + columns=multi_index, + ) + df = df.sort_index(level=0, axis=1) + + expected = DataFrame(index=range(5), columns=multi_index.reindex([])[0]) + result = df.loc[:, columns_indexer] + tm.assert_frame_equal(result, expected) + + +def test_loc_getitem_duplicates_multiindex_non_scalar_type_object(): + # regression from < 0.14.0 + # GH 7914 + df = DataFrame( + [[np.mean, np.median], ["mean", "median"]], + columns=MultiIndex.from_tuples([("functs", "mean"), ("functs", "median")]), + index=["function", "name"], + ) + result = df.loc["function", ("functs", "mean")] + expected = np.mean + assert result == expected + + +def test_loc_getitem_tuple_plus_slice(): + # GH 671 + df = DataFrame( + { + "a": np.arange(10), + "b": np.arange(10), + "c": np.random.default_rng(2).standard_normal(10), + "d": np.random.default_rng(2).standard_normal(10), + } + ).set_index(["a", "b"]) + expected = df.loc[0, 0] + result = df.loc[(0, 0), :] + tm.assert_series_equal(result, expected) + + +def test_loc_getitem_int(frame_random_data_integer_multi_index): + df = frame_random_data_integer_multi_index + result = df.loc[1] + expected = df[-3:] + expected.index = expected.index.droplevel(0) + tm.assert_frame_equal(result, expected) + + +def test_loc_getitem_int_raises_exception(frame_random_data_integer_multi_index): + df = frame_random_data_integer_multi_index + with pytest.raises(KeyError, match=r"^3$"): + df.loc[3] + + +def test_loc_getitem_lowerdim_corner(multiindex_dataframe_random_data): + df = multiindex_dataframe_random_data + + # test setup - check key not in dataframe + with pytest.raises(KeyError, match=r"^\('bar', 'three'\)$"): + df.loc[("bar", "three"), "B"] + + # in theory should be inserting in a sorted space???? + df.loc[("bar", "three"), "B"] = 0 + expected = 0 + result = df.sort_index().loc[("bar", "three"), "B"] + assert result == expected + + +def test_loc_setitem_single_column_slice(): + # case from https://github.com/pandas-dev/pandas/issues/27841 + df = DataFrame( + "string", + index=list("abcd"), + columns=MultiIndex.from_product([["Main"], ("another", "one")]), + ) + df["labels"] = "a" + df.loc[:, "labels"] = df.index + tm.assert_numpy_array_equal(np.asarray(df["labels"]), np.asarray(df.index)) + + # test with non-object block + df = DataFrame( + np.nan, + index=range(4), + columns=MultiIndex.from_tuples([("A", "1"), ("A", "2"), ("B", "1")]), + ) + expected = df.copy() + df.loc[:, "B"] = np.arange(4) + expected.iloc[:, 2] = np.arange(4) + tm.assert_frame_equal(df, expected) + + +def test_loc_nan_multiindex(using_infer_string): + # GH 5286 + tups = [ + ("Good Things", "C", np.nan), + ("Good Things", "R", np.nan), + ("Bad Things", "C", np.nan), + ("Bad Things", "T", np.nan), + ("Okay Things", "N", "B"), + ("Okay Things", "N", "D"), + ("Okay Things", "B", np.nan), + ("Okay Things", "D", np.nan), + ] + df = DataFrame( + np.ones((8, 4)), + columns=Index(["d1", "d2", "d3", "d4"]), + index=MultiIndex.from_tuples(tups, names=["u1", "u2", "u3"]), + ) + result = df.loc["Good Things"].loc["C"] + expected = DataFrame( + np.ones((1, 4)), + index=Index( + [np.nan], + dtype="object" if not using_infer_string else "string[pyarrow_numpy]", + name="u3", + ), + columns=Index(["d1", "d2", "d3", "d4"]), + ) + tm.assert_frame_equal(result, expected) + + +def test_loc_period_string_indexing(): + # GH 9892 + a = pd.period_range("2013Q1", "2013Q4", freq="Q") + i = (1111, 2222, 3333) + idx = MultiIndex.from_product((a, i), names=("Period", "CVR")) + df = DataFrame( + index=idx, + columns=( + "OMS", + "OMK", + "RES", + "DRIFT_IND", + "OEVRIG_IND", + "FIN_IND", + "VARE_UD", + "LOEN_UD", + "FIN_UD", + ), + ) + result = df.loc[("2013Q1", 1111), "OMS"] + + alt = df.loc[(a[0], 1111), "OMS"] + assert np.isnan(alt) + + # Because the resolution of the string matches, it is an exact lookup, + # not a slice + assert np.isnan(result) + + alt = df.loc[("2013Q1", 1111), "OMS"] + assert np.isnan(alt) + + +def test_loc_datetime_mask_slicing(): + # GH 16699 + dt_idx = pd.to_datetime(["2017-05-04", "2017-05-05"]) + m_idx = MultiIndex.from_product([dt_idx, dt_idx], names=["Idx1", "Idx2"]) + df = DataFrame( + data=[[1, 2], [3, 4], [5, 6], [7, 6]], index=m_idx, columns=["C1", "C2"] + ) + result = df.loc[(dt_idx[0], (df.index.get_level_values(1) > "2017-05-04")), "C1"] + expected = Series( + [3], + name="C1", + index=MultiIndex.from_tuples( + [(pd.Timestamp("2017-05-04"), pd.Timestamp("2017-05-05"))], + names=["Idx1", "Idx2"], + ), + ) + tm.assert_series_equal(result, expected) + + +def test_loc_datetime_series_tuple_slicing(): + # https://github.com/pandas-dev/pandas/issues/35858 + date = pd.Timestamp("2000") + ser = Series( + 1, + index=MultiIndex.from_tuples([("a", date)], names=["a", "b"]), + name="c", + ) + result = ser.loc[:, [date]] + tm.assert_series_equal(result, ser) + + +def test_loc_with_mi_indexer(): + # https://github.com/pandas-dev/pandas/issues/35351 + df = DataFrame( + data=[["a", 1], ["a", 0], ["b", 1], ["c", 2]], + index=MultiIndex.from_tuples( + [(0, 1), (1, 0), (1, 1), (1, 1)], names=["index", "date"] + ), + columns=["author", "price"], + ) + idx = MultiIndex.from_tuples([(0, 1), (1, 1)], names=["index", "date"]) + result = df.loc[idx, :] + expected = DataFrame( + [["a", 1], ["b", 1], ["c", 2]], + index=MultiIndex.from_tuples([(0, 1), (1, 1), (1, 1)], names=["index", "date"]), + columns=["author", "price"], + ) + tm.assert_frame_equal(result, expected) + + +def test_loc_mi_with_level1_named_0(): + # GH#37194 + dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") + + ser = Series(range(3), index=dti) + df = ser.to_frame() + df[1] = dti + + df2 = df.set_index(0, append=True) + assert df2.index.names == (None, 0) + df2.index.get_loc(dti[0]) # smoke test + + result = df2.loc[dti[0]] + expected = df2.iloc[[0]].droplevel(None) + tm.assert_frame_equal(result, expected) + + ser2 = df2[1] + assert ser2.index.names == (None, 0) + + result = ser2.loc[dti[0]] + expected = ser2.iloc[[0]].droplevel(None) + tm.assert_series_equal(result, expected) + + +def test_getitem_str_slice(): + # GH#15928 + df = DataFrame( + [ + ["20160525 13:30:00.023", "MSFT", "51.95", "51.95"], + ["20160525 13:30:00.048", "GOOG", "720.50", "720.93"], + ["20160525 13:30:00.076", "AAPL", "98.55", "98.56"], + ["20160525 13:30:00.131", "AAPL", "98.61", "98.62"], + ["20160525 13:30:00.135", "MSFT", "51.92", "51.95"], + ["20160525 13:30:00.135", "AAPL", "98.61", "98.62"], + ], + columns="time,ticker,bid,ask".split(","), + ) + df2 = df.set_index(["ticker", "time"]).sort_index() + + res = df2.loc[("AAPL", slice("2016-05-25 13:30:00")), :].droplevel(0) + expected = df2.loc["AAPL"].loc[slice("2016-05-25 13:30:00"), :] + tm.assert_frame_equal(res, expected) + + +def test_3levels_leading_period_index(): + # GH#24091 + pi = pd.PeriodIndex( + ["20181101 1100", "20181101 1200", "20181102 1300", "20181102 1400"], + name="datetime", + freq="D", + ) + lev2 = ["A", "A", "Z", "W"] + lev3 = ["B", "C", "Q", "F"] + mi = MultiIndex.from_arrays([pi, lev2, lev3]) + + ser = Series(range(4), index=mi, dtype=np.float64) + result = ser.loc[(pi[0], "A", "B")] + assert result == 0.0 + + +class TestKeyErrorsWithMultiIndex: + def test_missing_keys_raises_keyerror(self): + # GH#27420 KeyError, not TypeError + df = DataFrame(np.arange(12).reshape(4, 3), columns=["A", "B", "C"]) + df2 = df.set_index(["A", "B"]) + + with pytest.raises(KeyError, match="1"): + df2.loc[(1, 6)] + + def test_missing_key_raises_keyerror2(self): + # GH#21168 KeyError, not "IndexingError: Too many indexers" + ser = Series(-1, index=MultiIndex.from_product([[0, 1]] * 2)) + + with pytest.raises(KeyError, match=r"\(0, 3\)"): + ser.loc[0, 3] + + def test_missing_key_combination(self): + # GH: 19556 + mi = MultiIndex.from_arrays( + [ + np.array(["a", "a", "b", "b"]), + np.array(["1", "2", "2", "3"]), + np.array(["c", "d", "c", "d"]), + ], + names=["one", "two", "three"], + ) + df = DataFrame(np.random.default_rng(2).random((4, 3)), index=mi) + msg = r"\('b', '1', slice\(None, None, None\)\)" + with pytest.raises(KeyError, match=msg): + df.loc[("b", "1", slice(None)), :] + with pytest.raises(KeyError, match=msg): + df.index.get_locs(("b", "1", slice(None))) + with pytest.raises(KeyError, match=r"\('b', '1'\)"): + df.loc[("b", "1"), :] + + +def test_getitem_loc_commutability(multiindex_year_month_day_dataframe_random_data): + df = multiindex_year_month_day_dataframe_random_data + ser = df["A"] + result = ser[2000, 5] + expected = df.loc[2000, 5]["A"] + tm.assert_series_equal(result, expected) + + +def test_loc_with_nan(): + # GH: 27104 + df = DataFrame( + {"col": [1, 2, 5], "ind1": ["a", "d", np.nan], "ind2": [1, 4, 5]} + ).set_index(["ind1", "ind2"]) + result = df.loc[["a"]] + expected = DataFrame( + {"col": [1]}, index=MultiIndex.from_tuples([("a", 1)], names=["ind1", "ind2"]) + ) + tm.assert_frame_equal(result, expected) + + result = df.loc["a"] + expected = DataFrame({"col": [1]}, index=Index([1], name="ind2")) + tm.assert_frame_equal(result, expected) + + +def test_getitem_non_found_tuple(): + # GH: 25236 + df = DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"]).set_index( + ["a", "b", "c"] + ) + with pytest.raises(KeyError, match=r"\(2\.0, 2\.0, 3\.0\)"): + df.loc[(2.0, 2.0, 3.0)] + + +def test_get_loc_datetime_index(): + # GH#24263 + index = pd.date_range("2001-01-01", periods=100) + mi = MultiIndex.from_arrays([index]) + # Check if get_loc matches for Index and MultiIndex + assert mi.get_loc("2001-01") == slice(0, 31, None) + assert index.get_loc("2001-01") == slice(0, 31, None) + + loc = mi[::2].get_loc("2001-01") + expected = index[::2].get_loc("2001-01") + assert loc == expected + + loc = mi.repeat(2).get_loc("2001-01") + expected = index.repeat(2).get_loc("2001-01") + assert loc == expected + + loc = mi.append(mi).get_loc("2001-01") + expected = index.append(index).get_loc("2001-01") + # TODO: standardize return type for MultiIndex.get_loc + tm.assert_numpy_array_equal(loc.nonzero()[0], expected) + + +def test_loc_setitem_indexer_differently_ordered(): + # GH#34603 + mi = MultiIndex.from_product([["a", "b"], [0, 1]]) + df = DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]], index=mi) + + indexer = ("a", [1, 0]) + df.loc[indexer, :] = np.array([[9, 10], [11, 12]]) + expected = DataFrame([[11, 12], [9, 10], [5, 6], [7, 8]], index=mi) + tm.assert_frame_equal(df, expected) + + +def test_loc_getitem_index_differently_ordered_slice_none(): + # GH#31330 + df = DataFrame( + [[1, 2], [3, 4], [5, 6], [7, 8]], + index=[["a", "a", "b", "b"], [1, 2, 1, 2]], + columns=["a", "b"], + ) + result = df.loc[(slice(None), [2, 1]), :] + expected = DataFrame( + [[3, 4], [7, 8], [1, 2], [5, 6]], + index=[["a", "b", "a", "b"], [2, 2, 1, 1]], + columns=["a", "b"], + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("indexer", [[1, 2, 7, 6, 2, 3, 8, 7], [1, 2, 7, 6, 3, 8]]) +def test_loc_getitem_index_differently_ordered_slice_none_duplicates(indexer): + # GH#40978 + df = DataFrame( + [1] * 8, + index=MultiIndex.from_tuples( + [(1, 1), (1, 2), (1, 7), (1, 6), (2, 2), (2, 3), (2, 8), (2, 7)] + ), + columns=["a"], + ) + result = df.loc[(slice(None), indexer), :] + expected = DataFrame( + [1] * 8, + index=[[1, 1, 2, 1, 2, 1, 2, 2], [1, 2, 2, 7, 7, 6, 3, 8]], + columns=["a"], + ) + tm.assert_frame_equal(result, expected) + + result = df.loc[df.index.isin(indexer, level=1), :] + tm.assert_frame_equal(result, df) + + +def test_loc_getitem_drops_levels_for_one_row_dataframe(): + # GH#10521 "x" and "z" are both scalar indexing, so those levels are dropped + mi = MultiIndex.from_arrays([["x"], ["y"], ["z"]], names=["a", "b", "c"]) + df = DataFrame({"d": [0]}, index=mi) + expected = df.droplevel([0, 2]) + result = df.loc["x", :, "z"] + tm.assert_frame_equal(result, expected) + + ser = Series([0], index=mi) + result = ser.loc["x", :, "z"] + expected = Series([0], index=Index(["y"], name="b")) + tm.assert_series_equal(result, expected) + + +def test_mi_columns_loc_list_label_order(): + # GH 10710 + cols = MultiIndex.from_product([["A", "B", "C"], [1, 2]]) + df = DataFrame(np.zeros((5, 6)), columns=cols) + result = df.loc[:, ["B", "A"]] + expected = DataFrame( + np.zeros((5, 4)), + columns=MultiIndex.from_tuples([("B", 1), ("B", 2), ("A", 1), ("A", 2)]), + ) + tm.assert_frame_equal(result, expected) + + +def test_mi_partial_indexing_list_raises(): + # GH 13501 + frame = DataFrame( + np.arange(12).reshape((4, 3)), + index=[["a", "a", "b", "b"], [1, 2, 1, 2]], + columns=[["Ohio", "Ohio", "Colorado"], ["Green", "Red", "Green"]], + ) + frame.index.names = ["key1", "key2"] + frame.columns.names = ["state", "color"] + with pytest.raises(KeyError, match="\\[2\\] not in index"): + frame.loc[["b", 2], "Colorado"] + + +def test_mi_indexing_list_nonexistent_raises(): + # GH 15452 + s = Series(range(4), index=MultiIndex.from_product([[1, 2], ["a", "b"]])) + with pytest.raises(KeyError, match="\\['not' 'found'\\] not in index"): + s.loc[["not", "found"]] + + +def test_mi_add_cell_missing_row_non_unique(): + # GH 16018 + result = DataFrame( + [[1, 2, 5, 6], [3, 4, 7, 8]], + index=["a", "a"], + columns=MultiIndex.from_product([[1, 2], ["A", "B"]]), + ) + result.loc["c"] = -1 + result.loc["c", (1, "A")] = 3 + result.loc["d", (1, "A")] = 3 + expected = DataFrame( + [ + [1.0, 2.0, 5.0, 6.0], + [3.0, 4.0, 7.0, 8.0], + [3.0, -1.0, -1, -1], + [3.0, np.nan, np.nan, np.nan], + ], + index=["a", "a", "c", "d"], + columns=MultiIndex.from_product([[1, 2], ["A", "B"]]), + ) + tm.assert_frame_equal(result, expected) + + +def test_loc_get_scalar_casting_to_float(): + # GH#41369 + df = DataFrame( + {"a": 1.0, "b": 2}, index=MultiIndex.from_arrays([[3], [4]], names=["c", "d"]) + ) + result = df.loc[(3, 4), "b"] + assert result == 2 + assert isinstance(result, np.int64) + result = df.loc[[(3, 4)], "b"].iloc[0] + assert result == 2 + assert isinstance(result, np.int64) + + +def test_loc_empty_single_selector_with_names(): + # GH 19517 + idx = MultiIndex.from_product([["a", "b"], ["A", "B"]], names=[1, 0]) + s2 = Series(index=idx, dtype=np.float64) + result = s2.loc["a"] + expected = Series([np.nan, np.nan], index=Index(["A", "B"], name=0)) + tm.assert_series_equal(result, expected) + + +def test_loc_keyerror_rightmost_key_missing(): + # GH 20951 + + df = DataFrame( + { + "A": [100, 100, 200, 200, 300, 300], + "B": [10, 10, 20, 21, 31, 33], + "C": range(6), + } + ) + df = df.set_index(["A", "B"]) + with pytest.raises(KeyError, match="^1$"): + df.loc[(100, 1)] + + +def test_multindex_series_loc_with_tuple_label(): + # GH#43908 + mi = MultiIndex.from_tuples([(1, 2), (3, (4, 5))]) + ser = Series([1, 2], index=mi) + result = ser.loc[(3, (4, 5))] + assert result == 2 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_multiindex.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_multiindex.py new file mode 100644 index 0000000000000000000000000000000000000000..36cc8316ea5ff4f7a5d264748e6c202d723129d9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_multiindex.py @@ -0,0 +1,235 @@ +import numpy as np +import pytest + +import pandas._libs.index as libindex +from pandas.errors import PerformanceWarning + +import pandas as pd +from pandas import ( + CategoricalDtype, + DataFrame, + Index, + MultiIndex, + Series, +) +import pandas._testing as tm +from pandas.core.arrays.boolean import BooleanDtype + + +class TestMultiIndexBasic: + def test_multiindex_perf_warn(self): + df = DataFrame( + { + "jim": [0, 0, 1, 1], + "joe": ["x", "x", "z", "y"], + "jolie": np.random.default_rng(2).random(4), + } + ).set_index(["jim", "joe"]) + + with tm.assert_produces_warning(PerformanceWarning): + df.loc[(1, "z")] + + df = df.iloc[[2, 1, 3, 0]] + with tm.assert_produces_warning(PerformanceWarning): + df.loc[(0,)] + + @pytest.mark.parametrize("offset", [-5, 5]) + def test_indexing_over_hashtable_size_cutoff(self, monkeypatch, offset): + size_cutoff = 20 + n = size_cutoff + offset + + with monkeypatch.context(): + monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff) + s = Series(np.arange(n), MultiIndex.from_arrays((["a"] * n, np.arange(n)))) + + # hai it works! + assert s[("a", 5)] == 5 + assert s[("a", 6)] == 6 + assert s[("a", 7)] == 7 + + def test_multi_nan_indexing(self): + # GH 3588 + df = DataFrame( + { + "a": ["R1", "R2", np.nan, "R4"], + "b": ["C1", "C2", "C3", "C4"], + "c": [10, 15, np.nan, 20], + } + ) + result = df.set_index(["a", "b"], drop=False) + expected = DataFrame( + { + "a": ["R1", "R2", np.nan, "R4"], + "b": ["C1", "C2", "C3", "C4"], + "c": [10, 15, np.nan, 20], + }, + index=[ + Index(["R1", "R2", np.nan, "R4"], name="a"), + Index(["C1", "C2", "C3", "C4"], name="b"), + ], + ) + tm.assert_frame_equal(result, expected) + + def test_exclusive_nat_column_indexing(self): + # GH 38025 + # test multi indexing when one column exclusively contains NaT values + df = DataFrame( + { + "a": [pd.NaT, pd.NaT, pd.NaT, pd.NaT], + "b": ["C1", "C2", "C3", "C4"], + "c": [10, 15, np.nan, 20], + } + ) + df = df.set_index(["a", "b"]) + expected = DataFrame( + { + "c": [10, 15, np.nan, 20], + }, + index=[ + Index([pd.NaT, pd.NaT, pd.NaT, pd.NaT], name="a"), + Index(["C1", "C2", "C3", "C4"], name="b"), + ], + ) + tm.assert_frame_equal(df, expected) + + def test_nested_tuples_duplicates(self): + # GH#30892 + + dti = pd.to_datetime(["20190101", "20190101", "20190102"]) + idx = Index(["a", "a", "c"]) + mi = MultiIndex.from_arrays([dti, idx], names=["index1", "index2"]) + + df = DataFrame({"c1": [1, 2, 3], "c2": [np.nan, np.nan, np.nan]}, index=mi) + + expected = DataFrame({"c1": df["c1"], "c2": [1.0, 1.0, np.nan]}, index=mi) + + df2 = df.copy(deep=True) + df2.loc[(dti[0], "a"), "c2"] = 1.0 + tm.assert_frame_equal(df2, expected) + + df3 = df.copy(deep=True) + df3.loc[[(dti[0], "a")], "c2"] = 1.0 + tm.assert_frame_equal(df3, expected) + + def test_multiindex_with_datatime_level_preserves_freq(self): + # https://github.com/pandas-dev/pandas/issues/35563 + idx = Index(range(2), name="A") + dti = pd.date_range("2020-01-01", periods=7, freq="D", name="B") + mi = MultiIndex.from_product([idx, dti]) + df = DataFrame(np.random.default_rng(2).standard_normal((14, 2)), index=mi) + result = df.loc[0].index + tm.assert_index_equal(result, dti) + assert result.freq == dti.freq + + def test_multiindex_complex(self): + # GH#42145 + complex_data = [1 + 2j, 4 - 3j, 10 - 1j] + non_complex_data = [3, 4, 5] + result = DataFrame( + { + "x": complex_data, + "y": non_complex_data, + "z": non_complex_data, + } + ) + result.set_index(["x", "y"], inplace=True) + expected = DataFrame( + {"z": non_complex_data}, + index=MultiIndex.from_arrays( + [complex_data, non_complex_data], + names=("x", "y"), + ), + ) + tm.assert_frame_equal(result, expected) + + def test_rename_multiindex_with_duplicates(self): + # GH 38015 + mi = MultiIndex.from_tuples([("A", "cat"), ("B", "cat"), ("B", "cat")]) + df = DataFrame(index=mi) + df = df.rename(index={"A": "Apple"}, level=0) + + mi2 = MultiIndex.from_tuples([("Apple", "cat"), ("B", "cat"), ("B", "cat")]) + expected = DataFrame(index=mi2) + tm.assert_frame_equal(df, expected) + + def test_series_align_multiindex_with_nan_overlap_only(self): + # GH 38439 + mi1 = MultiIndex.from_arrays([[81.0, np.nan], [np.nan, np.nan]]) + mi2 = MultiIndex.from_arrays([[np.nan, 82.0], [np.nan, np.nan]]) + ser1 = Series([1, 2], index=mi1) + ser2 = Series([1, 2], index=mi2) + result1, result2 = ser1.align(ser2) + + mi = MultiIndex.from_arrays([[81.0, 82.0, np.nan], [np.nan, np.nan, np.nan]]) + expected1 = Series([1.0, np.nan, 2.0], index=mi) + expected2 = Series([np.nan, 2.0, 1.0], index=mi) + + tm.assert_series_equal(result1, expected1) + tm.assert_series_equal(result2, expected2) + + def test_series_align_multiindex_with_nan(self): + # GH 38439 + mi1 = MultiIndex.from_arrays([[81.0, np.nan], [np.nan, np.nan]]) + mi2 = MultiIndex.from_arrays([[np.nan, 81.0], [np.nan, np.nan]]) + ser1 = Series([1, 2], index=mi1) + ser2 = Series([1, 2], index=mi2) + result1, result2 = ser1.align(ser2) + + mi = MultiIndex.from_arrays([[81.0, np.nan], [np.nan, np.nan]]) + expected1 = Series([1, 2], index=mi) + expected2 = Series([2, 1], index=mi) + + tm.assert_series_equal(result1, expected1) + tm.assert_series_equal(result2, expected2) + + def test_nunique_smoke(self): + # GH 34019 + n = DataFrame([[1, 2], [1, 2]]).set_index([0, 1]).index.nunique() + assert n == 1 + + def test_multiindex_repeated_keys(self): + # GH19414 + tm.assert_series_equal( + Series([1, 2], MultiIndex.from_arrays([["a", "b"]])).loc[ + ["a", "a", "b", "b"] + ], + Series([1, 1, 2, 2], MultiIndex.from_arrays([["a", "a", "b", "b"]])), + ) + + def test_multiindex_with_na_missing_key(self): + # GH46173 + df = DataFrame.from_dict( + { + ("foo",): [1, 2, 3], + ("bar",): [5, 6, 7], + (None,): [8, 9, 0], + } + ) + with pytest.raises(KeyError, match="missing_key"): + df[[("missing_key",)]] + + def test_multiindex_dtype_preservation(self): + # GH51261 + columns = MultiIndex.from_tuples([("A", "B")], names=["lvl1", "lvl2"]) + df = DataFrame(["value"], columns=columns).astype("category") + df_no_multiindex = df["A"] + assert isinstance(df_no_multiindex["B"].dtype, CategoricalDtype) + + # geopandas 1763 analogue + df = DataFrame( + [[1, 0], [0, 1]], + columns=[ + ["foo", "foo"], + ["location", "location"], + ["x", "y"], + ], + ).assign(bools=Series([True, False], dtype="boolean")) + assert isinstance(df["bools"].dtype, BooleanDtype) + + def test_multiindex_from_tuples_with_nan(self): + # GH#23578 + result = MultiIndex.from_tuples([("a", "b", "c"), np.nan, ("d", "", "")]) + expected = MultiIndex.from_tuples( + [("a", "b", "c"), (np.nan, np.nan, np.nan), ("d", "", "")] + ) + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_partial.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_partial.py new file mode 100644 index 0000000000000000000000000000000000000000..fdf88b2a97e461702b63bbfad31905682ff66b35 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_partial.py @@ -0,0 +1,269 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import ( + DataFrame, + DatetimeIndex, + MultiIndex, + date_range, +) +import pandas._testing as tm + + +class TestMultiIndexPartial: + def test_getitem_partial_int(self): + # GH 12416 + # with single item + l1 = [10, 20] + l2 = ["a", "b"] + df = DataFrame(index=range(2), columns=MultiIndex.from_product([l1, l2])) + expected = DataFrame(index=range(2), columns=l2) + result = df[20] + tm.assert_frame_equal(result, expected) + + # with list + expected = DataFrame( + index=range(2), columns=MultiIndex.from_product([l1[1:], l2]) + ) + result = df[[20]] + tm.assert_frame_equal(result, expected) + + # missing item: + with pytest.raises(KeyError, match="1"): + df[1] + with pytest.raises(KeyError, match=r"'\[1\] not in index'"): + df[[1]] + + def test_series_slice_partial(self): + pass + + def test_xs_partial( + self, + multiindex_dataframe_random_data, + multiindex_year_month_day_dataframe_random_data, + ): + frame = multiindex_dataframe_random_data + ymd = multiindex_year_month_day_dataframe_random_data + result = frame.xs("foo") + result2 = frame.loc["foo"] + expected = frame.T["foo"].T + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, result2) + + result = ymd.xs((2000, 4)) + expected = ymd.loc[2000, 4] + tm.assert_frame_equal(result, expected) + + # ex from #1796 + index = MultiIndex( + levels=[["foo", "bar"], ["one", "two"], [-1, 1]], + codes=[ + [0, 0, 0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 0, 0, 1, 1], + [0, 1, 0, 1, 0, 1, 0, 1], + ], + ) + df = DataFrame( + np.random.default_rng(2).standard_normal((8, 4)), + index=index, + columns=list("abcd"), + ) + + result = df.xs(("foo", "one")) + expected = df.loc["foo", "one"] + tm.assert_frame_equal(result, expected) + + def test_getitem_partial(self, multiindex_year_month_day_dataframe_random_data): + ymd = multiindex_year_month_day_dataframe_random_data + ymd = ymd.T + result = ymd[2000, 2] + + expected = ymd.reindex(columns=ymd.columns[ymd.columns.codes[1] == 1]) + expected.columns = expected.columns.droplevel(0).droplevel(0) + tm.assert_frame_equal(result, expected) + + def test_fancy_slice_partial( + self, + multiindex_dataframe_random_data, + multiindex_year_month_day_dataframe_random_data, + ): + frame = multiindex_dataframe_random_data + result = frame.loc["bar":"baz"] + expected = frame[3:7] + tm.assert_frame_equal(result, expected) + + ymd = multiindex_year_month_day_dataframe_random_data + result = ymd.loc[(2000, 2):(2000, 4)] + lev = ymd.index.codes[1] + expected = ymd[(lev >= 1) & (lev <= 3)] + tm.assert_frame_equal(result, expected) + + def test_getitem_partial_column_select(self): + idx = MultiIndex( + codes=[[0, 0, 0], [0, 1, 1], [1, 0, 1]], + levels=[["a", "b"], ["x", "y"], ["p", "q"]], + ) + df = DataFrame(np.random.default_rng(2).random((3, 2)), index=idx) + + result = df.loc[("a", "y"), :] + expected = df.loc[("a", "y")] + tm.assert_frame_equal(result, expected) + + result = df.loc[("a", "y"), [1, 0]] + expected = df.loc[("a", "y")][[1, 0]] + tm.assert_frame_equal(result, expected) + + with pytest.raises(KeyError, match=r"\('a', 'foo'\)"): + df.loc[("a", "foo"), :] + + # TODO(ArrayManager) rewrite test to not use .values + # exp.loc[2000, 4].values[:] select multiple columns -> .values is not a view + @td.skip_array_manager_invalid_test + def test_partial_set( + self, + multiindex_year_month_day_dataframe_random_data, + using_copy_on_write, + warn_copy_on_write, + ): + # GH #397 + ymd = multiindex_year_month_day_dataframe_random_data + df = ymd.copy() + exp = ymd.copy() + df.loc[2000, 4] = 0 + exp.iloc[65:85] = 0 + tm.assert_frame_equal(df, exp) + + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + df["A"].loc[2000, 4] = 1 + df.loc[(2000, 4), "A"] = 1 + else: + with tm.raises_chained_assignment_error(): + df["A"].loc[2000, 4] = 1 + exp.iloc[65:85, 0] = 1 + tm.assert_frame_equal(df, exp) + + df.loc[2000] = 5 + exp.iloc[:100] = 5 + tm.assert_frame_equal(df, exp) + + # this works...for now + with tm.raises_chained_assignment_error(): + df["A"].iloc[14] = 5 + if using_copy_on_write: + assert df["A"].iloc[14] == exp["A"].iloc[14] + else: + assert df["A"].iloc[14] == 5 + + @pytest.mark.parametrize("dtype", [int, float]) + def test_getitem_intkey_leading_level( + self, multiindex_year_month_day_dataframe_random_data, dtype + ): + # GH#33355 dont fall-back to positional when leading level is int + ymd = multiindex_year_month_day_dataframe_random_data + levels = ymd.index.levels + ymd.index = ymd.index.set_levels([levels[0].astype(dtype)] + levels[1:]) + ser = ymd["A"] + mi = ser.index + assert isinstance(mi, MultiIndex) + if dtype is int: + assert mi.levels[0].dtype == np.dtype(int) + else: + assert mi.levels[0].dtype == np.float64 + + assert 14 not in mi.levels[0] + assert not mi.levels[0]._should_fallback_to_positional + assert not mi._should_fallback_to_positional + + with pytest.raises(KeyError, match="14"): + ser[14] + + # --------------------------------------------------------------------- + + def test_setitem_multiple_partial(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + expected = frame.copy() + result = frame.copy() + result.loc[["foo", "bar"]] = 0 + expected.loc["foo"] = 0 + expected.loc["bar"] = 0 + tm.assert_frame_equal(result, expected) + + expected = frame.copy() + result = frame.copy() + result.loc["foo":"bar"] = 0 + expected.loc["foo"] = 0 + expected.loc["bar"] = 0 + tm.assert_frame_equal(result, expected) + + expected = frame["A"].copy() + result = frame["A"].copy() + result.loc[["foo", "bar"]] = 0 + expected.loc["foo"] = 0 + expected.loc["bar"] = 0 + tm.assert_series_equal(result, expected) + + expected = frame["A"].copy() + result = frame["A"].copy() + result.loc["foo":"bar"] = 0 + expected.loc["foo"] = 0 + expected.loc["bar"] = 0 + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "indexer, exp_idx, exp_values", + [ + ( + slice("2019-2", None), + DatetimeIndex(["2019-02-01"], dtype="M8[ns]"), + [2, 3], + ), + ( + slice(None, "2019-2"), + date_range("2019", periods=2, freq="MS"), + [0, 1, 2, 3], + ), + ], + ) + def test_partial_getitem_loc_datetime(self, indexer, exp_idx, exp_values): + # GH: 25165 + date_idx = date_range("2019", periods=2, freq="MS") + df = DataFrame( + list(range(4)), + index=MultiIndex.from_product([date_idx, [0, 1]], names=["x", "y"]), + ) + expected = DataFrame( + exp_values, + index=MultiIndex.from_product([exp_idx, [0, 1]], names=["x", "y"]), + ) + result = df[indexer] + tm.assert_frame_equal(result, expected) + result = df.loc[indexer] + tm.assert_frame_equal(result, expected) + + result = df.loc(axis=0)[indexer] + tm.assert_frame_equal(result, expected) + + result = df.loc[indexer, :] + tm.assert_frame_equal(result, expected) + + df2 = df.swaplevel(0, 1).sort_index() + expected = expected.swaplevel(0, 1).sort_index() + + result = df2.loc[:, indexer, :] + tm.assert_frame_equal(result, expected) + + +def test_loc_getitem_partial_both_axis(): + # gh-12660 + iterables = [["a", "b"], [2, 1]] + columns = MultiIndex.from_product(iterables, names=["col1", "col2"]) + rows = MultiIndex.from_product(iterables, names=["row1", "row2"]) + df = DataFrame( + np.random.default_rng(2).standard_normal((4, 4)), index=rows, columns=columns + ) + expected = df.iloc[:2, 2:].droplevel("row1").droplevel("col1", axis=1) + result = df.loc["a", "b"] + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_setitem.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_setitem.py new file mode 100644 index 0000000000000000000000000000000000000000..53ad4d6b41687e8e778710d1de22c19ebbfd3495 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_setitem.py @@ -0,0 +1,589 @@ +import numpy as np +import pytest + +from pandas.errors import SettingWithCopyError +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + MultiIndex, + Series, + date_range, + isna, + notna, +) +import pandas._testing as tm + + +def assert_equal(a, b): + assert a == b + + +class TestMultiIndexSetItem: + def check(self, target, indexers, value, compare_fn=assert_equal, expected=None): + target.loc[indexers] = value + result = target.loc[indexers] + if expected is None: + expected = value + compare_fn(result, expected) + + def test_setitem_multiindex(self): + # GH#7190 + cols = ["A", "w", "l", "a", "x", "X", "d", "profit"] + index = MultiIndex.from_product( + [np.arange(0, 100), np.arange(0, 80)], names=["time", "firm"] + ) + t, n = 0, 2 + + df = DataFrame( + np.nan, + columns=cols, + index=index, + ) + self.check(target=df, indexers=((t, n), "X"), value=0) + + df = DataFrame(-999, columns=cols, index=index) + self.check(target=df, indexers=((t, n), "X"), value=1) + + df = DataFrame(columns=cols, index=index) + self.check(target=df, indexers=((t, n), "X"), value=2) + + # gh-7218: assigning with 0-dim arrays + df = DataFrame(-999, columns=cols, index=index) + self.check( + target=df, + indexers=((t, n), "X"), + value=np.array(3), + expected=3, + ) + + def test_setitem_multiindex2(self): + # GH#5206 + df = DataFrame( + np.arange(25).reshape(5, 5), columns="A,B,C,D,E".split(","), dtype=float + ) + df["F"] = 99 + row_selection = df["A"] % 2 == 0 + col_selection = ["B", "C"] + df.loc[row_selection, col_selection] = df["F"] + output = DataFrame(99.0, index=[0, 2, 4], columns=["B", "C"]) + tm.assert_frame_equal(df.loc[row_selection, col_selection], output) + self.check( + target=df, + indexers=(row_selection, col_selection), + value=df["F"], + compare_fn=tm.assert_frame_equal, + expected=output, + ) + + def test_setitem_multiindex3(self): + # GH#11372 + idx = MultiIndex.from_product( + [["A", "B", "C"], date_range("2015-01-01", "2015-04-01", freq="MS")] + ) + cols = MultiIndex.from_product( + [["foo", "bar"], date_range("2016-01-01", "2016-02-01", freq="MS")] + ) + + df = DataFrame( + np.random.default_rng(2).random((12, 4)), index=idx, columns=cols + ) + + subidx = MultiIndex.from_arrays( + [["A", "A"], date_range("2015-01-01", "2015-02-01", freq="MS")] + ) + subcols = MultiIndex.from_arrays( + [["foo", "foo"], date_range("2016-01-01", "2016-02-01", freq="MS")] + ) + + vals = DataFrame( + np.random.default_rng(2).random((2, 2)), index=subidx, columns=subcols + ) + self.check( + target=df, + indexers=(subidx, subcols), + value=vals, + compare_fn=tm.assert_frame_equal, + ) + # set all columns + vals = DataFrame( + np.random.default_rng(2).random((2, 4)), index=subidx, columns=cols + ) + self.check( + target=df, + indexers=(subidx, slice(None, None, None)), + value=vals, + compare_fn=tm.assert_frame_equal, + ) + # identity + copy = df.copy() + self.check( + target=df, + indexers=(df.index, df.columns), + value=df, + compare_fn=tm.assert_frame_equal, + expected=copy, + ) + + # TODO(ArrayManager) df.loc["bar"] *= 2 doesn't raise an error but results in + # all NaNs -> doesn't work in the "split" path (also for BlockManager actually) + @td.skip_array_manager_not_yet_implemented + def test_multiindex_setitem(self): + # GH 3738 + # setting with a multi-index right hand side + arrays = [ + np.array(["bar", "bar", "baz", "qux", "qux", "bar"]), + np.array(["one", "two", "one", "one", "two", "one"]), + np.arange(0, 6, 1), + ] + + df_orig = DataFrame( + np.random.default_rng(2).standard_normal((6, 3)), + index=arrays, + columns=["A", "B", "C"], + ).sort_index() + + expected = df_orig.loc[["bar"]] * 2 + df = df_orig.copy() + df.loc[["bar"]] *= 2 + tm.assert_frame_equal(df.loc[["bar"]], expected) + + # raise because these have differing levels + msg = "cannot align on a multi-index with out specifying the join levels" + with pytest.raises(TypeError, match=msg): + df.loc["bar"] *= 2 + + def test_multiindex_setitem2(self): + # from SO + # https://stackoverflow.com/questions/24572040/pandas-access-the-level-of-multiindex-for-inplace-operation + df_orig = DataFrame.from_dict( + { + "price": { + ("DE", "Coal", "Stock"): 2, + ("DE", "Gas", "Stock"): 4, + ("DE", "Elec", "Demand"): 1, + ("FR", "Gas", "Stock"): 5, + ("FR", "Solar", "SupIm"): 0, + ("FR", "Wind", "SupIm"): 0, + } + } + ) + df_orig.index = MultiIndex.from_tuples( + df_orig.index, names=["Sit", "Com", "Type"] + ) + + expected = df_orig.copy() + expected.iloc[[0, 1, 3]] *= 2 + + idx = pd.IndexSlice + df = df_orig.copy() + df.loc[idx[:, :, "Stock"], :] *= 2 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[idx[:, :, "Stock"], "price"] *= 2 + tm.assert_frame_equal(df, expected) + + def test_multiindex_assignment(self): + # GH3777 part 2 + + # mixed dtype + df = DataFrame( + np.random.default_rng(2).integers(5, 10, size=9).reshape(3, 3), + columns=list("abc"), + index=[[4, 4, 8], [8, 10, 12]], + ) + df["d"] = np.nan + arr = np.array([0.0, 1.0]) + + df.loc[4, "d"] = arr + tm.assert_series_equal(df.loc[4, "d"], Series(arr, index=[8, 10], name="d")) + + def test_multiindex_assignment_single_dtype( + self, using_copy_on_write, warn_copy_on_write + ): + # GH3777 part 2b + # single dtype + arr = np.array([0.0, 1.0]) + + df = DataFrame( + np.random.default_rng(2).integers(5, 10, size=9).reshape(3, 3), + columns=list("abc"), + index=[[4, 4, 8], [8, 10, 12]], + dtype=np.int64, + ) + view = df["c"].iloc[:2].values + + # arr can be losslessly cast to int, so this setitem is inplace + # INFO(CoW-warn) this does not warn because we directly took .values + # above, so no reference to a pandas object is alive for `view` + df.loc[4, "c"] = arr + exp = Series(arr, index=[8, 10], name="c", dtype="int64") + result = df.loc[4, "c"] + tm.assert_series_equal(result, exp) + + # extra check for inplace-ness + if not using_copy_on_write: + tm.assert_numpy_array_equal(view, exp.values) + + # arr + 0.5 cannot be cast losslessly to int, so we upcast + with tm.assert_produces_warning( + FutureWarning, match="item of incompatible dtype" + ): + df.loc[4, "c"] = arr + 0.5 + result = df.loc[4, "c"] + exp = exp + 0.5 + tm.assert_series_equal(result, exp) + + # scalar ok + with tm.assert_cow_warning(warn_copy_on_write): + df.loc[4, "c"] = 10 + exp = Series(10, index=[8, 10], name="c", dtype="float64") + tm.assert_series_equal(df.loc[4, "c"], exp) + + # invalid assignments + msg = "Must have equal len keys and value when setting with an iterable" + with pytest.raises(ValueError, match=msg): + df.loc[4, "c"] = [0, 1, 2, 3] + + with pytest.raises(ValueError, match=msg): + df.loc[4, "c"] = [0] + + # But with a length-1 listlike column indexer this behaves like + # `df.loc[4, "c"] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + df.loc[4, ["c"]] = [0] + assert (df.loc[4, "c"] == 0).all() + + def test_groupby_example(self): + # groupby example + NUM_ROWS = 100 + NUM_COLS = 10 + col_names = ["A" + num for num in map(str, np.arange(NUM_COLS).tolist())] + index_cols = col_names[:5] + + df = DataFrame( + np.random.default_rng(2).integers(5, size=(NUM_ROWS, NUM_COLS)), + dtype=np.int64, + columns=col_names, + ) + df = df.set_index(index_cols).sort_index() + grp = df.groupby(level=index_cols[:4]) + df["new_col"] = np.nan + + # we are actually operating on a copy here + # but in this case, that's ok + for name, df2 in grp: + new_vals = np.arange(df2.shape[0]) + df.loc[name, "new_col"] = new_vals + + def test_series_setitem( + self, multiindex_year_month_day_dataframe_random_data, warn_copy_on_write + ): + ymd = multiindex_year_month_day_dataframe_random_data + s = ymd["A"] + + with tm.assert_cow_warning(warn_copy_on_write): + s[2000, 3] = np.nan + assert isna(s.values[42:65]).all() + assert notna(s.values[:42]).all() + assert notna(s.values[65:]).all() + + with tm.assert_cow_warning(warn_copy_on_write): + s[2000, 3, 10] = np.nan + assert isna(s.iloc[49]) + + with pytest.raises(KeyError, match="49"): + # GH#33355 dont fall-back to positional when leading level is int + s[49] + + def test_frame_getitem_setitem_boolean(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + df = frame.T.copy() + values = df.values.copy() + + result = df[df > 0] + expected = df.where(df > 0) + tm.assert_frame_equal(result, expected) + + df[df > 0] = 5 + values[values > 0] = 5 + tm.assert_almost_equal(df.values, values) + + df[df == 5] = 0 + values[values == 5] = 0 + tm.assert_almost_equal(df.values, values) + + # a df that needs alignment first + df[df[:-1] < 0] = 2 + np.putmask(values[:-1], values[:-1] < 0, 2) + tm.assert_almost_equal(df.values, values) + + with pytest.raises(TypeError, match="boolean values only"): + df[df * 0] = 2 + + def test_frame_getitem_setitem_multislice(self): + levels = [["t1", "t2"], ["a", "b", "c"]] + codes = [[0, 0, 0, 1, 1], [0, 1, 2, 0, 1]] + midx = MultiIndex(codes=codes, levels=levels, names=[None, "id"]) + df = DataFrame({"value": [1, 2, 3, 7, 8]}, index=midx) + + result = df.loc[:, "value"] + tm.assert_series_equal(df["value"], result) + + result = df.loc[df.index[1:3], "value"] + tm.assert_series_equal(df["value"][1:3], result) + + result = df.loc[:, :] + tm.assert_frame_equal(df, result) + + result = df + df.loc[:, "value"] = 10 + result["value"] = 10 + tm.assert_frame_equal(df, result) + + df.loc[:, :] = 10 + tm.assert_frame_equal(df, result) + + def test_frame_setitem_multi_column(self): + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=[["a", "a", "b", "b"], [0, 1, 0, 1]], + ) + + cp = df.copy() + cp["a"] = cp["b"] + tm.assert_frame_equal(cp["a"], cp["b"]) + + # set with ndarray + cp = df.copy() + cp["a"] = cp["b"].values + tm.assert_frame_equal(cp["a"], cp["b"]) + + def test_frame_setitem_multi_column2(self): + # --------------------------------------- + # GH#1803 + columns = MultiIndex.from_tuples([("A", "1"), ("A", "2"), ("B", "1")]) + df = DataFrame(index=[1, 3, 5], columns=columns) + + # Works, but adds a column instead of updating the two existing ones + df["A"] = 0.0 # Doesn't work + assert (df["A"].values == 0).all() + + # it broadcasts + df["B", "1"] = [1, 2, 3] + df["A"] = df["B", "1"] + + sliced_a1 = df["A", "1"] + sliced_a2 = df["A", "2"] + sliced_b1 = df["B", "1"] + tm.assert_series_equal(sliced_a1, sliced_b1, check_names=False) + tm.assert_series_equal(sliced_a2, sliced_b1, check_names=False) + assert sliced_a1.name == ("A", "1") + assert sliced_a2.name == ("A", "2") + assert sliced_b1.name == ("B", "1") + + def test_loc_getitem_tuple_plus_columns( + self, multiindex_year_month_day_dataframe_random_data + ): + # GH #1013 + ymd = multiindex_year_month_day_dataframe_random_data + df = ymd[:5] + + result = df.loc[(2000, 1, 6), ["A", "B", "C"]] + expected = df.loc[2000, 1, 6][["A", "B", "C"]] + tm.assert_series_equal(result, expected) + + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") + def test_loc_getitem_setitem_slice_integers(self, frame_or_series): + index = MultiIndex( + levels=[[0, 1, 2], [0, 2]], codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]] + ) + + obj = DataFrame( + np.random.default_rng(2).standard_normal((len(index), 4)), + index=index, + columns=["a", "b", "c", "d"], + ) + obj = tm.get_obj(obj, frame_or_series) + + res = obj.loc[1:2] + exp = obj.reindex(obj.index[2:]) + tm.assert_equal(res, exp) + + obj.loc[1:2] = 7 + assert (obj.loc[1:2] == 7).values.all() + + def test_setitem_change_dtype(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + dft = frame.T + s = dft["foo", "two"] + dft["foo", "two"] = s > s.median() + tm.assert_series_equal(dft["foo", "two"], s > s.median()) + # assert isinstance(dft._data.blocks[1].items, MultiIndex) + + reindexed = dft.reindex(columns=[("foo", "two")]) + tm.assert_series_equal(reindexed["foo", "two"], s > s.median()) + + def test_set_column_scalar_with_loc( + self, multiindex_dataframe_random_data, using_copy_on_write, warn_copy_on_write + ): + frame = multiindex_dataframe_random_data + subset = frame.index[[1, 4, 5]] + + frame.loc[subset] = 99 + assert (frame.loc[subset].values == 99).all() + + frame_original = frame.copy() + col = frame["B"] + with tm.assert_cow_warning(warn_copy_on_write): + col[subset] = 97 + if using_copy_on_write: + # chained setitem doesn't work with CoW + tm.assert_frame_equal(frame, frame_original) + else: + assert (frame.loc[subset, "B"] == 97).all() + + def test_nonunique_assignment_1750(self): + df = DataFrame( + [[1, 1, "x", "X"], [1, 1, "y", "Y"], [1, 2, "z", "Z"]], columns=list("ABCD") + ) + + df = df.set_index(["A", "B"]) + mi = MultiIndex.from_tuples([(1, 1)]) + + df.loc[mi, "C"] = "_" + + assert (df.xs((1, 1))["C"] == "_").all() + + def test_astype_assignment_with_dups(self): + # GH 4686 + # assignment with dups that has a dtype change + cols = MultiIndex.from_tuples([("A", "1"), ("B", "1"), ("A", "2")]) + df = DataFrame(np.arange(3).reshape((1, 3)), columns=cols, dtype=object) + index = df.index.copy() + + df["A"] = df["A"].astype(np.float64) + tm.assert_index_equal(df.index, index) + + def test_setitem_nonmonotonic(self): + # https://github.com/pandas-dev/pandas/issues/31449 + index = MultiIndex.from_tuples( + [("a", "c"), ("b", "x"), ("a", "d")], names=["l1", "l2"] + ) + df = DataFrame(data=[0, 1, 2], index=index, columns=["e"]) + df.loc["a", "e"] = np.arange(99, 101, dtype="int64") + expected = DataFrame({"e": [99, 1, 100]}, index=index) + tm.assert_frame_equal(df, expected) + + +class TestSetitemWithExpansionMultiIndex: + def test_setitem_new_column_mixed_depth(self): + arrays = [ + ["a", "top", "top", "routine1", "routine1", "routine2"], + ["", "OD", "OD", "result1", "result2", "result1"], + ["", "wx", "wy", "", "", ""], + ] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + df = DataFrame(np.random.default_rng(2).standard_normal((4, 6)), columns=index) + + result = df.copy() + expected = df.copy() + result["b"] = [1, 2, 3, 4] + expected["b", "", ""] = [1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + def test_setitem_new_column_all_na(self): + # GH#1534 + mix = MultiIndex.from_tuples([("1a", "2a"), ("1a", "2b"), ("1a", "2c")]) + df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mix) + s = Series({(1, 1): 1, (1, 2): 2}) + df["new"] = s + assert df["new"].isna().all() + + def test_setitem_enlargement_keep_index_names(self): + # GH#53053 + mi = MultiIndex.from_tuples([(1, 2, 3)], names=["i1", "i2", "i3"]) + df = DataFrame(data=[[10, 20, 30]], index=mi, columns=["A", "B", "C"]) + df.loc[(0, 0, 0)] = df.loc[(1, 2, 3)] + mi_expected = MultiIndex.from_tuples( + [(1, 2, 3), (0, 0, 0)], names=["i1", "i2", "i3"] + ) + expected = DataFrame( + data=[[10, 20, 30], [10, 20, 30]], + index=mi_expected, + columns=["A", "B", "C"], + ) + tm.assert_frame_equal(df, expected) + + +@td.skip_array_manager_invalid_test # df["foo"] select multiple columns -> .values +# is not a view +def test_frame_setitem_view_direct( + multiindex_dataframe_random_data, using_copy_on_write +): + # this works because we are modifying the underlying array + # really a no-no + df = multiindex_dataframe_random_data.T + if using_copy_on_write: + with pytest.raises(ValueError, match="read-only"): + df["foo"].values[:] = 0 + assert (df["foo"].values != 0).all() + else: + df["foo"].values[:] = 0 + assert (df["foo"].values == 0).all() + + +def test_frame_setitem_copy_raises( + multiindex_dataframe_random_data, using_copy_on_write, warn_copy_on_write +): + # will raise/warn as its chained assignment + df = multiindex_dataframe_random_data.T + if using_copy_on_write or warn_copy_on_write: + with tm.raises_chained_assignment_error(): + df["foo"]["one"] = 2 + else: + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(SettingWithCopyError, match=msg): + with tm.raises_chained_assignment_error(): + df["foo"]["one"] = 2 + + +def test_frame_setitem_copy_no_write( + multiindex_dataframe_random_data, using_copy_on_write, warn_copy_on_write +): + frame = multiindex_dataframe_random_data.T + expected = frame + df = frame.copy() + if using_copy_on_write or warn_copy_on_write: + with tm.raises_chained_assignment_error(): + df["foo"]["one"] = 2 + else: + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(SettingWithCopyError, match=msg): + with tm.raises_chained_assignment_error(): + df["foo"]["one"] = 2 + + result = df + tm.assert_frame_equal(result, expected) + + +def test_frame_setitem_partial_multiindex(): + # GH 54875 + df = DataFrame( + { + "a": [1, 2, 3], + "b": [3, 4, 5], + "c": 6, + "d": 7, + } + ).set_index(["a", "b", "c"]) + ser = Series(8, index=df.index.droplevel("c")) + result = df.copy() + result["d"] = ser + expected = df.copy() + expected["d"] = 8 + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_slice.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_slice.py new file mode 100644 index 0000000000000000000000000000000000000000..cef3dca054758eb8c4926455c449d73d63c0dc63 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_slice.py @@ -0,0 +1,796 @@ +from datetime import ( + datetime, + timedelta, +) + +import numpy as np +import pytest + +from pandas.errors import UnsortedIndexError + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, + Timestamp, +) +import pandas._testing as tm +from pandas.tests.indexing.common import _mklbl + + +class TestMultiIndexSlicers: + def test_per_axis_per_level_getitem(self): + # GH6134 + # example test case + ix = MultiIndex.from_product( + [_mklbl("A", 5), _mklbl("B", 7), _mklbl("C", 4), _mklbl("D", 2)] + ) + df = DataFrame(np.arange(len(ix.to_numpy())), index=ix) + + result = df.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :] + expected = df.loc[ + [ + ( + a, + b, + c, + d, + ) + for a, b, c, d in df.index.values + if a in ("A1", "A2", "A3") and c in ("C1", "C3") + ] + ] + tm.assert_frame_equal(result, expected) + + expected = df.loc[ + [ + ( + a, + b, + c, + d, + ) + for a, b, c, d in df.index.values + if a in ("A1", "A2", "A3") and c in ("C1", "C2", "C3") + ] + ] + result = df.loc[(slice("A1", "A3"), slice(None), slice("C1", "C3")), :] + tm.assert_frame_equal(result, expected) + + # test multi-index slicing with per axis and per index controls + index = MultiIndex.from_tuples( + [("A", 1), ("A", 2), ("A", 3), ("B", 1)], names=["one", "two"] + ) + columns = MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], + names=["lvl0", "lvl1"], + ) + + df = DataFrame( + np.arange(16, dtype="int64").reshape(4, 4), index=index, columns=columns + ) + df = df.sort_index(axis=0).sort_index(axis=1) + + # identity + result = df.loc[(slice(None), slice(None)), :] + tm.assert_frame_equal(result, df) + result = df.loc[(slice(None), slice(None)), (slice(None), slice(None))] + tm.assert_frame_equal(result, df) + result = df.loc[:, (slice(None), slice(None))] + tm.assert_frame_equal(result, df) + + # index + result = df.loc[(slice(None), [1]), :] + expected = df.iloc[[0, 3]] + tm.assert_frame_equal(result, expected) + + result = df.loc[(slice(None), 1), :] + expected = df.iloc[[0, 3]] + tm.assert_frame_equal(result, expected) + + # columns + result = df.loc[:, (slice(None), ["foo"])] + expected = df.iloc[:, [1, 3]] + tm.assert_frame_equal(result, expected) + + # both + result = df.loc[(slice(None), 1), (slice(None), ["foo"])] + expected = df.iloc[[0, 3], [1, 3]] + tm.assert_frame_equal(result, expected) + + result = df.loc["A", "a"] + expected = DataFrame( + {"bar": [1, 5, 9], "foo": [0, 4, 8]}, + index=Index([1, 2, 3], name="two"), + columns=Index(["bar", "foo"], name="lvl1"), + ) + tm.assert_frame_equal(result, expected) + + result = df.loc[(slice(None), [1, 2]), :] + expected = df.iloc[[0, 1, 3]] + tm.assert_frame_equal(result, expected) + + # multi-level series + s = Series(np.arange(len(ix.to_numpy())), index=ix) + result = s.loc["A1":"A3", :, ["C1", "C3"]] + expected = s.loc[ + [ + ( + a, + b, + c, + d, + ) + for a, b, c, d in s.index.values + if a in ("A1", "A2", "A3") and c in ("C1", "C3") + ] + ] + tm.assert_series_equal(result, expected) + + # boolean indexers + result = df.loc[(slice(None), df.loc[:, ("a", "bar")] > 5), :] + expected = df.iloc[[2, 3]] + tm.assert_frame_equal(result, expected) + + msg = ( + "cannot index with a boolean indexer " + "that is not the same length as the index" + ) + with pytest.raises(ValueError, match=msg): + df.loc[(slice(None), np.array([True, False])), :] + + with pytest.raises(KeyError, match=r"\[1\] not in index"): + # slice(None) is on the index, [1] is on the columns, but 1 is + # not in the columns, so we raise + # This used to treat [1] as positional GH#16396 + df.loc[slice(None), [1]] + + # not lexsorted + assert df.index._lexsort_depth == 2 + df = df.sort_index(level=1, axis=0) + assert df.index._lexsort_depth == 0 + + msg = ( + "MultiIndex slicing requires the index to be " + r"lexsorted: slicing on levels \[1\], lexsort depth 0" + ) + with pytest.raises(UnsortedIndexError, match=msg): + df.loc[(slice(None), slice("bar")), :] + + # GH 16734: not sorted, but no real slicing + result = df.loc[(slice(None), df.loc[:, ("a", "bar")] > 5), :] + tm.assert_frame_equal(result, df.iloc[[1, 3], :]) + + def test_multiindex_slicers_non_unique(self): + # GH 7106 + # non-unique mi index support + df = ( + DataFrame( + { + "A": ["foo", "foo", "foo", "foo"], + "B": ["a", "a", "a", "a"], + "C": [1, 2, 1, 3], + "D": [1, 2, 3, 4], + } + ) + .set_index(["A", "B", "C"]) + .sort_index() + ) + assert not df.index.is_unique + expected = ( + DataFrame({"A": ["foo", "foo"], "B": ["a", "a"], "C": [1, 1], "D": [1, 3]}) + .set_index(["A", "B", "C"]) + .sort_index() + ) + result = df.loc[(slice(None), slice(None), 1), :] + tm.assert_frame_equal(result, expected) + + # this is equivalent of an xs expression + result = df.xs(1, level=2, drop_level=False) + tm.assert_frame_equal(result, expected) + + df = ( + DataFrame( + { + "A": ["foo", "foo", "foo", "foo"], + "B": ["a", "a", "a", "a"], + "C": [1, 2, 1, 2], + "D": [1, 2, 3, 4], + } + ) + .set_index(["A", "B", "C"]) + .sort_index() + ) + assert not df.index.is_unique + expected = ( + DataFrame({"A": ["foo", "foo"], "B": ["a", "a"], "C": [1, 1], "D": [1, 3]}) + .set_index(["A", "B", "C"]) + .sort_index() + ) + result = df.loc[(slice(None), slice(None), 1), :] + assert not result.index.is_unique + tm.assert_frame_equal(result, expected) + + # GH12896 + # numpy-implementation dependent bug + ints = [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 12, + 13, + 14, + 14, + 16, + 17, + 18, + 19, + 200000, + 200000, + ] + n = len(ints) + idx = MultiIndex.from_arrays([["a"] * n, ints]) + result = Series([1] * n, index=idx) + result = result.sort_index() + result = result.loc[(slice(None), slice(100000))] + expected = Series([1] * (n - 2), index=idx[:-2]).sort_index() + tm.assert_series_equal(result, expected) + + def test_multiindex_slicers_datetimelike(self): + # GH 7429 + # buggy/inconsistent behavior when slicing with datetime-like + dates = [datetime(2012, 1, 1, 12, 12, 12) + timedelta(days=i) for i in range(6)] + freq = [1, 2] + index = MultiIndex.from_product([dates, freq], names=["date", "frequency"]) + + df = DataFrame( + np.arange(6 * 2 * 4, dtype="int64").reshape(-1, 4), + index=index, + columns=list("ABCD"), + ) + + # multi-axis slicing + idx = pd.IndexSlice + expected = df.iloc[[0, 2, 4], [0, 1]] + result = df.loc[ + ( + slice( + Timestamp("2012-01-01 12:12:12"), Timestamp("2012-01-03 12:12:12") + ), + slice(1, 1), + ), + slice("A", "B"), + ] + tm.assert_frame_equal(result, expected) + + result = df.loc[ + ( + idx[ + Timestamp("2012-01-01 12:12:12") : Timestamp("2012-01-03 12:12:12") + ], + idx[1:1], + ), + slice("A", "B"), + ] + tm.assert_frame_equal(result, expected) + + result = df.loc[ + ( + slice( + Timestamp("2012-01-01 12:12:12"), Timestamp("2012-01-03 12:12:12") + ), + 1, + ), + slice("A", "B"), + ] + tm.assert_frame_equal(result, expected) + + # with strings + result = df.loc[ + (slice("2012-01-01 12:12:12", "2012-01-03 12:12:12"), slice(1, 1)), + slice("A", "B"), + ] + tm.assert_frame_equal(result, expected) + + result = df.loc[ + (idx["2012-01-01 12:12:12":"2012-01-03 12:12:12"], 1), idx["A", "B"] + ] + tm.assert_frame_equal(result, expected) + + def test_multiindex_slicers_edges(self): + # GH 8132 + # various edge cases + df = DataFrame( + { + "A": ["A0"] * 5 + ["A1"] * 5 + ["A2"] * 5, + "B": ["B0", "B0", "B1", "B1", "B2"] * 3, + "DATE": [ + "2013-06-11", + "2013-07-02", + "2013-07-09", + "2013-07-30", + "2013-08-06", + "2013-06-11", + "2013-07-02", + "2013-07-09", + "2013-07-30", + "2013-08-06", + "2013-09-03", + "2013-10-01", + "2013-07-09", + "2013-08-06", + "2013-09-03", + ], + "VALUES": [22, 35, 14, 9, 4, 40, 18, 4, 2, 5, 1, 2, 3, 4, 2], + } + ) + + df["DATE"] = pd.to_datetime(df["DATE"]) + df1 = df.set_index(["A", "B", "DATE"]) + df1 = df1.sort_index() + + # A1 - Get all values under "A0" and "A1" + result = df1.loc[(slice("A1")), :] + expected = df1.iloc[0:10] + tm.assert_frame_equal(result, expected) + + # A2 - Get all values from the start to "A2" + result = df1.loc[(slice("A2")), :] + expected = df1 + tm.assert_frame_equal(result, expected) + + # A3 - Get all values under "B1" or "B2" + result = df1.loc[(slice(None), slice("B1", "B2")), :] + expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13, 14]] + tm.assert_frame_equal(result, expected) + + # A4 - Get all values between 2013-07-02 and 2013-07-09 + result = df1.loc[(slice(None), slice(None), slice("20130702", "20130709")), :] + expected = df1.iloc[[1, 2, 6, 7, 12]] + tm.assert_frame_equal(result, expected) + + # B1 - Get all values in B0 that are also under A0, A1 and A2 + result = df1.loc[(slice("A2"), slice("B0")), :] + expected = df1.iloc[[0, 1, 5, 6, 10, 11]] + tm.assert_frame_equal(result, expected) + + # B2 - Get all values in B0, B1 and B2 (similar to what #2 is doing for + # the As) + result = df1.loc[(slice(None), slice("B2")), :] + expected = df1 + tm.assert_frame_equal(result, expected) + + # B3 - Get all values from B1 to B2 and up to 2013-08-06 + result = df1.loc[(slice(None), slice("B1", "B2"), slice("2013-08-06")), :] + expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13]] + tm.assert_frame_equal(result, expected) + + # B4 - Same as A4 but the start of the date slice is not a key. + # shows indexing on a partial selection slice + result = df1.loc[(slice(None), slice(None), slice("20130701", "20130709")), :] + expected = df1.iloc[[1, 2, 6, 7, 12]] + tm.assert_frame_equal(result, expected) + + def test_per_axis_per_level_doc_examples(self): + # test index maker + idx = pd.IndexSlice + + # from indexing.rst / advanced + index = MultiIndex.from_product( + [_mklbl("A", 4), _mklbl("B", 2), _mklbl("C", 4), _mklbl("D", 2)] + ) + columns = MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], + names=["lvl0", "lvl1"], + ) + df = DataFrame( + np.arange(len(index) * len(columns), dtype="int64").reshape( + (len(index), len(columns)) + ), + index=index, + columns=columns, + ) + result = df.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :] + expected = df.loc[ + [ + ( + a, + b, + c, + d, + ) + for a, b, c, d in df.index.values + if a in ("A1", "A2", "A3") and c in ("C1", "C3") + ] + ] + tm.assert_frame_equal(result, expected) + result = df.loc[idx["A1":"A3", :, ["C1", "C3"]], :] + tm.assert_frame_equal(result, expected) + + result = df.loc[(slice(None), slice(None), ["C1", "C3"]), :] + expected = df.loc[ + [ + ( + a, + b, + c, + d, + ) + for a, b, c, d in df.index.values + if c in ("C1", "C3") + ] + ] + tm.assert_frame_equal(result, expected) + result = df.loc[idx[:, :, ["C1", "C3"]], :] + tm.assert_frame_equal(result, expected) + + # not sorted + msg = ( + "MultiIndex slicing requires the index to be lexsorted: " + r"slicing on levels \[1\], lexsort depth 1" + ) + with pytest.raises(UnsortedIndexError, match=msg): + df.loc["A1", ("a", slice("foo"))] + + # GH 16734: not sorted, but no real slicing + tm.assert_frame_equal( + df.loc["A1", (slice(None), "foo")], df.loc["A1"].iloc[:, [0, 2]] + ) + + df = df.sort_index(axis=1) + + # slicing + df.loc["A1", (slice(None), "foo")] + df.loc[(slice(None), slice(None), ["C1", "C3"]), (slice(None), "foo")] + + # setitem + df.loc(axis=0)[:, :, ["C1", "C3"]] = -10 + + def test_loc_axis_arguments(self): + index = MultiIndex.from_product( + [_mklbl("A", 4), _mklbl("B", 2), _mklbl("C", 4), _mklbl("D", 2)] + ) + columns = MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], + names=["lvl0", "lvl1"], + ) + df = ( + DataFrame( + np.arange(len(index) * len(columns), dtype="int64").reshape( + (len(index), len(columns)) + ), + index=index, + columns=columns, + ) + .sort_index() + .sort_index(axis=1) + ) + + # axis 0 + result = df.loc(axis=0)["A1":"A3", :, ["C1", "C3"]] + expected = df.loc[ + [ + ( + a, + b, + c, + d, + ) + for a, b, c, d in df.index.values + if a in ("A1", "A2", "A3") and c in ("C1", "C3") + ] + ] + tm.assert_frame_equal(result, expected) + + result = df.loc(axis="index")[:, :, ["C1", "C3"]] + expected = df.loc[ + [ + ( + a, + b, + c, + d, + ) + for a, b, c, d in df.index.values + if c in ("C1", "C3") + ] + ] + tm.assert_frame_equal(result, expected) + + # axis 1 + result = df.loc(axis=1)[:, "foo"] + expected = df.loc[:, (slice(None), "foo")] + tm.assert_frame_equal(result, expected) + + result = df.loc(axis="columns")[:, "foo"] + expected = df.loc[:, (slice(None), "foo")] + tm.assert_frame_equal(result, expected) + + # invalid axis + for i in [-1, 2, "foo"]: + msg = f"No axis named {i} for object type DataFrame" + with pytest.raises(ValueError, match=msg): + df.loc(axis=i)[:, :, ["C1", "C3"]] + + def test_loc_axis_single_level_multi_col_indexing_multiindex_col_df(self): + # GH29519 + df = DataFrame( + np.arange(27).reshape(3, 9), + columns=MultiIndex.from_product([["a1", "a2", "a3"], ["b1", "b2", "b3"]]), + ) + result = df.loc(axis=1)["a1":"a2"] + expected = df.iloc[:, :-3] + + tm.assert_frame_equal(result, expected) + + def test_loc_axis_single_level_single_col_indexing_multiindex_col_df(self): + # GH29519 + df = DataFrame( + np.arange(27).reshape(3, 9), + columns=MultiIndex.from_product([["a1", "a2", "a3"], ["b1", "b2", "b3"]]), + ) + result = df.loc(axis=1)["a1"] + expected = df.iloc[:, :3] + expected.columns = ["b1", "b2", "b3"] + + tm.assert_frame_equal(result, expected) + + def test_loc_ax_single_level_indexer_simple_df(self): + # GH29519 + # test single level indexing on single index column data frame + df = DataFrame(np.arange(9).reshape(3, 3), columns=["a", "b", "c"]) + result = df.loc(axis=1)["a"] + expected = Series(np.array([0, 3, 6]), name="a") + tm.assert_series_equal(result, expected) + + def test_per_axis_per_level_setitem(self): + # test index maker + idx = pd.IndexSlice + + # test multi-index slicing with per axis and per index controls + index = MultiIndex.from_tuples( + [("A", 1), ("A", 2), ("A", 3), ("B", 1)], names=["one", "two"] + ) + columns = MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], + names=["lvl0", "lvl1"], + ) + + df_orig = DataFrame( + np.arange(16, dtype="int64").reshape(4, 4), index=index, columns=columns + ) + df_orig = df_orig.sort_index(axis=0).sort_index(axis=1) + + # identity + df = df_orig.copy() + df.loc[(slice(None), slice(None)), :] = 100 + expected = df_orig.copy() + expected.iloc[:, :] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc(axis=0)[:, :] = 100 + expected = df_orig.copy() + expected.iloc[:, :] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[(slice(None), slice(None)), (slice(None), slice(None))] = 100 + expected = df_orig.copy() + expected.iloc[:, :] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[:, (slice(None), slice(None))] = 100 + expected = df_orig.copy() + expected.iloc[:, :] = 100 + tm.assert_frame_equal(df, expected) + + # index + df = df_orig.copy() + df.loc[(slice(None), [1]), :] = 100 + expected = df_orig.copy() + expected.iloc[[0, 3]] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[(slice(None), 1), :] = 100 + expected = df_orig.copy() + expected.iloc[[0, 3]] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc(axis=0)[:, 1] = 100 + expected = df_orig.copy() + expected.iloc[[0, 3]] = 100 + tm.assert_frame_equal(df, expected) + + # columns + df = df_orig.copy() + df.loc[:, (slice(None), ["foo"])] = 100 + expected = df_orig.copy() + expected.iloc[:, [1, 3]] = 100 + tm.assert_frame_equal(df, expected) + + # both + df = df_orig.copy() + df.loc[(slice(None), 1), (slice(None), ["foo"])] = 100 + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[idx[:, 1], idx[:, ["foo"]]] = 100 + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc["A", "a"] = 100 + expected = df_orig.copy() + expected.iloc[0:3, 0:2] = 100 + tm.assert_frame_equal(df, expected) + + # setting with a list-like + df = df_orig.copy() + df.loc[(slice(None), 1), (slice(None), ["foo"])] = np.array( + [[100, 100], [100, 100]], dtype="int64" + ) + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] = 100 + tm.assert_frame_equal(df, expected) + + # not enough values + df = df_orig.copy() + + msg = "setting an array element with a sequence." + with pytest.raises(ValueError, match=msg): + df.loc[(slice(None), 1), (slice(None), ["foo"])] = np.array( + [[100], [100, 100]], dtype="int64" + ) + + msg = "Must have equal len keys and value when setting with an iterable" + with pytest.raises(ValueError, match=msg): + df.loc[(slice(None), 1), (slice(None), ["foo"])] = np.array( + [100, 100, 100, 100], dtype="int64" + ) + + # with an alignable rhs + df = df_orig.copy() + df.loc[(slice(None), 1), (slice(None), ["foo"])] = ( + df.loc[(slice(None), 1), (slice(None), ["foo"])] * 5 + ) + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] = expected.iloc[[0, 3], [1, 3]] * 5 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[(slice(None), 1), (slice(None), ["foo"])] *= df.loc[ + (slice(None), 1), (slice(None), ["foo"]) + ] + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]] + tm.assert_frame_equal(df, expected) + + rhs = df_orig.loc[(slice(None), 1), (slice(None), ["foo"])].copy() + rhs.loc[:, ("c", "bah")] = 10 + df = df_orig.copy() + df.loc[(slice(None), 1), (slice(None), ["foo"])] *= rhs + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]] + tm.assert_frame_equal(df, expected) + + def test_multiindex_label_slicing_with_negative_step(self): + ser = Series( + np.arange(20), MultiIndex.from_product([list("abcde"), np.arange(4)]) + ) + SLC = pd.IndexSlice + + tm.assert_indexing_slices_equivalent(ser, SLC[::-1], SLC[::-1]) + + tm.assert_indexing_slices_equivalent(ser, SLC["d"::-1], SLC[15::-1]) + tm.assert_indexing_slices_equivalent(ser, SLC[("d",)::-1], SLC[15::-1]) + + tm.assert_indexing_slices_equivalent(ser, SLC[:"d":-1], SLC[:11:-1]) + tm.assert_indexing_slices_equivalent(ser, SLC[:("d",):-1], SLC[:11:-1]) + + tm.assert_indexing_slices_equivalent(ser, SLC["d":"b":-1], SLC[15:3:-1]) + tm.assert_indexing_slices_equivalent(ser, SLC[("d",):"b":-1], SLC[15:3:-1]) + tm.assert_indexing_slices_equivalent(ser, SLC["d":("b",):-1], SLC[15:3:-1]) + tm.assert_indexing_slices_equivalent(ser, SLC[("d",):("b",):-1], SLC[15:3:-1]) + tm.assert_indexing_slices_equivalent(ser, SLC["b":"d":-1], SLC[:0]) + + tm.assert_indexing_slices_equivalent(ser, SLC[("c", 2)::-1], SLC[10::-1]) + tm.assert_indexing_slices_equivalent(ser, SLC[:("c", 2):-1], SLC[:9:-1]) + tm.assert_indexing_slices_equivalent( + ser, SLC[("e", 0):("c", 2):-1], SLC[16:9:-1] + ) + + def test_multiindex_slice_first_level(self): + # GH 12697 + freq = ["a", "b", "c", "d"] + idx = MultiIndex.from_product([freq, range(500)]) + df = DataFrame(list(range(2000)), index=idx, columns=["Test"]) + df_slice = df.loc[pd.IndexSlice[:, 30:70], :] + result = df_slice.loc["a"] + expected = DataFrame(list(range(30, 71)), columns=["Test"], index=range(30, 71)) + tm.assert_frame_equal(result, expected) + result = df_slice.loc["d"] + expected = DataFrame( + list(range(1530, 1571)), columns=["Test"], index=range(30, 71) + ) + tm.assert_frame_equal(result, expected) + + def test_int_series_slicing(self, multiindex_year_month_day_dataframe_random_data): + ymd = multiindex_year_month_day_dataframe_random_data + s = ymd["A"] + result = s[5:] + expected = s.reindex(s.index[5:]) + tm.assert_series_equal(result, expected) + + s = ymd["A"].copy() + exp = ymd["A"].copy() + s[5:] = 0 + exp.iloc[5:] = 0 + tm.assert_numpy_array_equal(s.values, exp.values) + + result = ymd[5:] + expected = ymd.reindex(s.index[5:]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype, loc, iloc", + [ + # dtype = int, step = -1 + ("int", slice(None, None, -1), slice(None, None, -1)), + ("int", slice(3, None, -1), slice(3, None, -1)), + ("int", slice(None, 1, -1), slice(None, 0, -1)), + ("int", slice(3, 1, -1), slice(3, 0, -1)), + # dtype = int, step = -2 + ("int", slice(None, None, -2), slice(None, None, -2)), + ("int", slice(3, None, -2), slice(3, None, -2)), + ("int", slice(None, 1, -2), slice(None, 0, -2)), + ("int", slice(3, 1, -2), slice(3, 0, -2)), + # dtype = str, step = -1 + ("str", slice(None, None, -1), slice(None, None, -1)), + ("str", slice("d", None, -1), slice(3, None, -1)), + ("str", slice(None, "b", -1), slice(None, 0, -1)), + ("str", slice("d", "b", -1), slice(3, 0, -1)), + # dtype = str, step = -2 + ("str", slice(None, None, -2), slice(None, None, -2)), + ("str", slice("d", None, -2), slice(3, None, -2)), + ("str", slice(None, "b", -2), slice(None, 0, -2)), + ("str", slice("d", "b", -2), slice(3, 0, -2)), + ], + ) + def test_loc_slice_negative_stepsize(self, dtype, loc, iloc): + # GH#38071 + labels = { + "str": list("abcde"), + "int": range(5), + }[dtype] + + mi = MultiIndex.from_arrays([labels] * 2) + df = DataFrame(1.0, index=mi, columns=["A"]) + + SLC = pd.IndexSlice + + expected = df.iloc[iloc, :] + result_get_loc = df.loc[SLC[loc], :] + result_get_locs_level_0 = df.loc[SLC[loc, :], :] + result_get_locs_level_1 = df.loc[SLC[:, loc], :] + + tm.assert_frame_equal(result_get_loc, expected) + tm.assert_frame_equal(result_get_locs_level_0, expected) + tm.assert_frame_equal(result_get_locs_level_1, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_sorted.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_sorted.py new file mode 100644 index 0000000000000000000000000000000000000000..cf3fa5296c97c313292a0581cb776931c121fd52 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexing/multiindex/test_sorted.py @@ -0,0 +1,153 @@ +import numpy as np +import pytest + +from pandas import ( + NA, + DataFrame, + MultiIndex, + Series, + array, +) +import pandas._testing as tm + + +class TestMultiIndexSorted: + def test_getitem_multilevel_index_tuple_not_sorted(self): + index_columns = list("abc") + df = DataFrame( + [[0, 1, 0, "x"], [0, 0, 1, "y"]], columns=index_columns + ["data"] + ) + df = df.set_index(index_columns) + query_index = df.index[:1] + rs = df.loc[query_index, "data"] + + xp_idx = MultiIndex.from_tuples([(0, 1, 0)], names=["a", "b", "c"]) + xp = Series(["x"], index=xp_idx, name="data") + tm.assert_series_equal(rs, xp) + + def test_getitem_slice_not_sorted(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + df = frame.sort_index(level=1).T + + # buglet with int typechecking + result = df.iloc[:, : np.int32(3)] + expected = df.reindex(columns=df.columns[:3]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("key", [None, lambda x: x]) + def test_frame_getitem_not_sorted2(self, key): + # 13431 + df = DataFrame( + { + "col1": ["b", "d", "b", "a"], + "col2": [3, 1, 1, 2], + "data": ["one", "two", "three", "four"], + } + ) + + df2 = df.set_index(["col1", "col2"]) + df2_original = df2.copy() + + df2.index = df2.index.set_levels(["b", "d", "a"], level="col1") + df2.index = df2.index.set_codes([0, 1, 0, 2], level="col1") + assert not df2.index.is_monotonic_increasing + + assert df2_original.index.equals(df2.index) + expected = df2.sort_index(key=key) + assert expected.index.is_monotonic_increasing + + result = df2.sort_index(level=0, key=key) + assert result.index.is_monotonic_increasing + tm.assert_frame_equal(result, expected) + + def test_sort_values_key(self): + arrays = [ + ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + tuples = zip(*arrays) + index = MultiIndex.from_tuples(tuples) + index = index.sort_values( # sort by third letter + key=lambda x: x.map(lambda entry: entry[2]) + ) + result = DataFrame(range(8), index=index) + + arrays = [ + ["foo", "foo", "bar", "bar", "qux", "qux", "baz", "baz"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + tuples = zip(*arrays) + index = MultiIndex.from_tuples(tuples) + expected = DataFrame(range(8), index=index) + + tm.assert_frame_equal(result, expected) + + def test_argsort_with_na(self): + # GH48495 + arrays = [ + array([2, NA, 1], dtype="Int64"), + array([1, 2, 3], dtype="Int64"), + ] + index = MultiIndex.from_arrays(arrays) + result = index.argsort() + expected = np.array([2, 0, 1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + def test_sort_values_with_na(self): + # GH48495 + arrays = [ + array([2, NA, 1], dtype="Int64"), + array([1, 2, 3], dtype="Int64"), + ] + index = MultiIndex.from_arrays(arrays) + index = index.sort_values() + result = DataFrame(range(3), index=index) + + arrays = [ + array([1, 2, NA], dtype="Int64"), + array([3, 1, 2], dtype="Int64"), + ] + index = MultiIndex.from_arrays(arrays) + expected = DataFrame(range(3), index=index) + + tm.assert_frame_equal(result, expected) + + def test_frame_getitem_not_sorted(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + df = frame.T + df["foo", "four"] = "foo" + + arrays = [np.array(x) for x in zip(*df.columns.values)] + + result = df["foo"] + result2 = df.loc[:, "foo"] + expected = df.reindex(columns=df.columns[arrays[0] == "foo"]) + expected.columns = expected.columns.droplevel(0) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected) + + df = df.T + result = df.xs("foo") + result2 = df.loc["foo"] + expected = df.reindex(df.index[arrays[0] == "foo"]) + expected.index = expected.index.droplevel(0) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected) + + def test_series_getitem_not_sorted(self): + arrays = [ + ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + tuples = zip(*arrays) + index = MultiIndex.from_tuples(tuples) + s = Series(np.random.default_rng(2).standard_normal(8), index=index) + + arrays = [np.array(x) for x in zip(*index.values)] + + result = s["qux"] + result2 = s.loc["qux"] + expected = s[arrays[0] == "qux"] + expected.index = expected.index.droplevel(0) + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result2, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/interchange/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/interchange/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0a89fc1dc73a3389698b754cdd4eb07ac4c338e7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/interchange/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/interchange/__pycache__/test_impl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/interchange/__pycache__/test_impl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f3b8c796755336a8281d59ac275e0045e71a7e6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/interchange/__pycache__/test_impl.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/interchange/__pycache__/test_spec_conformance.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/interchange/__pycache__/test_spec_conformance.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47b244036a2e3fbabf6fc53c875c0d55c181e5a3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/interchange/__pycache__/test_spec_conformance.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/interchange/__pycache__/test_utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/interchange/__pycache__/test_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b06eb756c46862a5adafa581900a19afd4ade5ff Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/interchange/__pycache__/test_utils.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/internals/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/internals/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f42ab23aeff198ce9a29b95f8cfd96900420b1d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/internals/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/internals/__pycache__/test_api.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/internals/__pycache__/test_api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2c02b0c8b49d86f86ba2a13802b3653eca5019aa Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/internals/__pycache__/test_api.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/internals/__pycache__/test_internals.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/internals/__pycache__/test_internals.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7db19312b7c2a756b314d6c398ec550ddb59166 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/internals/__pycache__/test_internals.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/internals/__pycache__/test_managers.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/internals/__pycache__/test_managers.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..09b37b430c3612b71db084f9e74a321b3ef6f03e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/internals/__pycache__/test_managers.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..97d3ffaee6841cee16efa590d21e8a1fb178cfa1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/conftest.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/conftest.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2163c752a5033894ce244477c278d384b2f4a61f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/conftest.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/generate_legacy_storage_files.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/generate_legacy_storage_files.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c8aa957bf020ff84cce4b009d5705ed4465c8b5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/generate_legacy_storage_files.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_clipboard.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_clipboard.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f78953806d29a17b23533cd43768f8cbb6f33763 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_clipboard.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_common.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_common.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f03fc81089e917b840d555fa20f0224b782bf6af Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_common.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_compression.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_compression.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e636e0c234f1a4a086ebd37d6eab0967952a9132 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_compression.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_feather.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_feather.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20cf4b8d40d86be71c064bc76816cb19df1dd8f5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_feather.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_fsspec.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_fsspec.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ebb69ba192ffe765d207b81313bf9f73508396d6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_fsspec.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_gbq.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_gbq.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..808a9b50e1878f58d230e698cda3e6bc042be1a5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_gbq.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_gcs.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_gcs.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ed8ee2d6b54906dc91237e813cb274746684337 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_gcs.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_html.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_html.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e9d91fa17d84ca5c1a3ce7118222f5ec5cb23bf4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_html.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_http_headers.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_http_headers.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..224a57393c912463739fe9e234192d7f5d537a45 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_http_headers.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_orc.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_orc.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..69d498907fe456f557828ce43b7882a7144fc59a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_orc.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_parquet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_parquet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..75849dd2901bae2afbd44edbd002d60644d26c65 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_parquet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_pickle.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_pickle.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c182df4e6a9ef9da8d287e528de84908d87c62f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_pickle.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_s3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_s3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..785291b028f4740d595b29a9b1ca60f5f0bc2897 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_s3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_spss.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_spss.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f05f3b1a7a48d92eba35ca72ccdc7e189f4ce10f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/__pycache__/test_spss.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_odf.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_odf.py new file mode 100644 index 0000000000000000000000000000000000000000..b5bb9b27258d86cda6e44aeae17a4cdba4157a43 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_odf.py @@ -0,0 +1,77 @@ +import functools + +import numpy as np +import pytest + +from pandas.compat import is_platform_windows + +import pandas as pd +import pandas._testing as tm + +pytest.importorskip("odf") + +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + + +@pytest.fixture(autouse=True) +def cd_and_set_engine(monkeypatch, datapath): + func = functools.partial(pd.read_excel, engine="odf") + monkeypatch.setattr(pd, "read_excel", func) + monkeypatch.chdir(datapath("io", "data", "excel")) + + +def test_read_invalid_types_raises(): + # the invalid_value_type.ods required manually editing + # of the included content.xml file + with pytest.raises(ValueError, match="Unrecognized type awesome_new_type"): + pd.read_excel("invalid_value_type.ods") + + +def test_read_writer_table(): + # Also test reading tables from an text OpenDocument file + # (.odt) + index = pd.Index(["Row 1", "Row 2", "Row 3"], name="Header") + expected = pd.DataFrame( + [[1, np.nan, 7], [2, np.nan, 8], [3, np.nan, 9]], + index=index, + columns=["Column 1", "Unnamed: 2", "Column 3"], + ) + + result = pd.read_excel("writertable.odt", sheet_name="Table1", index_col=0) + + tm.assert_frame_equal(result, expected) + + +def test_read_newlines_between_xml_elements_table(): + # GH#45598 + expected = pd.DataFrame( + [[1.0, 4.0, 7], [np.nan, np.nan, 8], [3.0, 6.0, 9]], + columns=["Column 1", "Column 2", "Column 3"], + ) + + result = pd.read_excel("test_newlines.ods") + + tm.assert_frame_equal(result, expected) + + +def test_read_unempty_cells(): + expected = pd.DataFrame( + [1, np.nan, 3, np.nan, 5], + columns=["Column 1"], + ) + + result = pd.read_excel("test_unempty_cells.ods") + + tm.assert_frame_equal(result, expected) + + +def test_read_cell_annotation(): + expected = pd.DataFrame( + ["test", np.nan, "test 3"], + columns=["Column 1"], + ) + + result = pd.read_excel("test_cell_annotation.ods") + + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_odswriter.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_odswriter.py new file mode 100644 index 0000000000000000000000000000000000000000..1c728ad801bc139c1ca1cd2e902884a5a2c91ffc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_odswriter.py @@ -0,0 +1,106 @@ +from datetime import ( + date, + datetime, +) +import re + +import pytest + +from pandas.compat import is_platform_windows + +import pandas as pd +import pandas._testing as tm + +from pandas.io.excel import ExcelWriter + +odf = pytest.importorskip("odf") + +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + + +@pytest.fixture +def ext(): + return ".ods" + + +def test_write_append_mode_raises(ext): + msg = "Append mode is not supported with odf!" + + with tm.ensure_clean(ext) as f: + with pytest.raises(ValueError, match=msg): + ExcelWriter(f, engine="odf", mode="a") + + +@pytest.mark.parametrize("engine_kwargs", [None, {"kwarg": 1}]) +def test_engine_kwargs(ext, engine_kwargs): + # GH 42286 + # GH 43445 + # test for error: OpenDocumentSpreadsheet does not accept any arguments + with tm.ensure_clean(ext) as f: + if engine_kwargs is not None: + error = re.escape( + "OpenDocumentSpreadsheet() got an unexpected keyword argument 'kwarg'" + ) + with pytest.raises( + TypeError, + match=error, + ): + ExcelWriter(f, engine="odf", engine_kwargs=engine_kwargs) + else: + with ExcelWriter(f, engine="odf", engine_kwargs=engine_kwargs) as _: + pass + + +def test_book_and_sheets_consistent(ext): + # GH#45687 - Ensure sheets is updated if user modifies book + with tm.ensure_clean(ext) as f: + with ExcelWriter(f) as writer: + assert writer.sheets == {} + table = odf.table.Table(name="test_name") + writer.book.spreadsheet.addElement(table) + assert writer.sheets == {"test_name": table} + + +@pytest.mark.parametrize( + ["value", "cell_value_type", "cell_value_attribute", "cell_value"], + argvalues=[ + (True, "boolean", "boolean-value", "true"), + ("test string", "string", "string-value", "test string"), + (1, "float", "value", "1"), + (1.5, "float", "value", "1.5"), + ( + datetime(2010, 10, 10, 10, 10, 10), + "date", + "date-value", + "2010-10-10T10:10:10", + ), + (date(2010, 10, 10), "date", "date-value", "2010-10-10"), + ], +) +def test_cell_value_type(ext, value, cell_value_type, cell_value_attribute, cell_value): + # GH#54994 ODS: cell attributes should follow specification + # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#refTable13 + from odf.namespaces import OFFICENS + from odf.table import ( + TableCell, + TableRow, + ) + + table_cell_name = TableCell().qname + + with tm.ensure_clean(ext) as f: + pd.DataFrame([[value]]).to_excel(f, header=False, index=False) + + with pd.ExcelFile(f) as wb: + sheet = wb._reader.get_sheet_by_index(0) + sheet_rows = sheet.getElementsByType(TableRow) + sheet_cells = [ + x + for x in sheet_rows[0].childNodes + if hasattr(x, "qname") and x.qname == table_cell_name + ] + + cell = sheet_cells[0] + assert cell.attributes.get((OFFICENS, "value-type")) == cell_value_type + assert cell.attributes.get((OFFICENS, cell_value_attribute)) == cell_value diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_openpyxl.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_openpyxl.py new file mode 100644 index 0000000000000000000000000000000000000000..e53b5830ec6a4b315165f4896aed27bdaadfbda6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_openpyxl.py @@ -0,0 +1,432 @@ +import contextlib +from pathlib import Path +import re + +import numpy as np +import pytest + +from pandas.compat import is_platform_windows + +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm + +from pandas.io.excel import ( + ExcelWriter, + _OpenpyxlWriter, +) +from pandas.io.excel._openpyxl import OpenpyxlReader + +openpyxl = pytest.importorskip("openpyxl") + +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + + +@pytest.fixture +def ext(): + return ".xlsx" + + +def test_to_excel_styleconverter(): + from openpyxl import styles + + hstyle = { + "font": {"color": "00FF0000", "bold": True}, + "borders": {"top": "thin", "right": "thin", "bottom": "thin", "left": "thin"}, + "alignment": {"horizontal": "center", "vertical": "top"}, + "fill": {"patternType": "solid", "fgColor": {"rgb": "006666FF", "tint": 0.3}}, + "number_format": {"format_code": "0.00"}, + "protection": {"locked": True, "hidden": False}, + } + + font_color = styles.Color("00FF0000") + font = styles.Font(bold=True, color=font_color) + side = styles.Side(style=styles.borders.BORDER_THIN) + border = styles.Border(top=side, right=side, bottom=side, left=side) + alignment = styles.Alignment(horizontal="center", vertical="top") + fill_color = styles.Color(rgb="006666FF", tint=0.3) + fill = styles.PatternFill(patternType="solid", fgColor=fill_color) + + number_format = "0.00" + + protection = styles.Protection(locked=True, hidden=False) + + kw = _OpenpyxlWriter._convert_to_style_kwargs(hstyle) + assert kw["font"] == font + assert kw["border"] == border + assert kw["alignment"] == alignment + assert kw["fill"] == fill + assert kw["number_format"] == number_format + assert kw["protection"] == protection + + +def test_write_cells_merge_styled(ext): + from pandas.io.formats.excel import ExcelCell + + sheet_name = "merge_styled" + + sty_b1 = {"font": {"color": "00FF0000"}} + sty_a2 = {"font": {"color": "0000FF00"}} + + initial_cells = [ + ExcelCell(col=1, row=0, val=42, style=sty_b1), + ExcelCell(col=0, row=1, val=99, style=sty_a2), + ] + + sty_merged = {"font": {"color": "000000FF", "bold": True}} + sty_kwargs = _OpenpyxlWriter._convert_to_style_kwargs(sty_merged) + openpyxl_sty_merged = sty_kwargs["font"] + merge_cells = [ + ExcelCell( + col=0, row=0, val="pandas", mergestart=1, mergeend=1, style=sty_merged + ) + ] + + with tm.ensure_clean(ext) as path: + with _OpenpyxlWriter(path) as writer: + writer._write_cells(initial_cells, sheet_name=sheet_name) + writer._write_cells(merge_cells, sheet_name=sheet_name) + + wks = writer.sheets[sheet_name] + xcell_b1 = wks["B1"] + xcell_a2 = wks["A2"] + assert xcell_b1.font == openpyxl_sty_merged + assert xcell_a2.font == openpyxl_sty_merged + + +@pytest.mark.parametrize("iso_dates", [True, False]) +def test_engine_kwargs_write(ext, iso_dates): + # GH 42286 GH 43445 + engine_kwargs = {"iso_dates": iso_dates} + with tm.ensure_clean(ext) as f: + with ExcelWriter(f, engine="openpyxl", engine_kwargs=engine_kwargs) as writer: + assert writer.book.iso_dates == iso_dates + # ExcelWriter won't allow us to close without writing something + DataFrame().to_excel(writer) + + +def test_engine_kwargs_append_invalid(ext): + # GH 43445 + # test whether an invalid engine kwargs actually raises + with tm.ensure_clean(ext) as f: + DataFrame(["hello", "world"]).to_excel(f) + with pytest.raises( + TypeError, + match=re.escape( + "load_workbook() got an unexpected keyword argument 'apple_banana'" + ), + ): + with ExcelWriter( + f, engine="openpyxl", mode="a", engine_kwargs={"apple_banana": "fruit"} + ) as writer: + # ExcelWriter needs us to write something to close properly + DataFrame(["good"]).to_excel(writer, sheet_name="Sheet2") + + +@pytest.mark.parametrize("data_only, expected", [(True, 0), (False, "=1+1")]) +def test_engine_kwargs_append_data_only(ext, data_only, expected): + # GH 43445 + # tests whether the data_only engine_kwarg actually works well for + # openpyxl's load_workbook + with tm.ensure_clean(ext) as f: + DataFrame(["=1+1"]).to_excel(f) + with ExcelWriter( + f, engine="openpyxl", mode="a", engine_kwargs={"data_only": data_only} + ) as writer: + assert writer.sheets["Sheet1"]["B2"].value == expected + # ExcelWriter needs us to writer something to close properly? + DataFrame().to_excel(writer, sheet_name="Sheet2") + + # ensure that data_only also works for reading + # and that formulas/values roundtrip + assert ( + pd.read_excel( + f, + sheet_name="Sheet1", + engine="openpyxl", + engine_kwargs={"data_only": data_only}, + ).iloc[0, 1] + == expected + ) + + +@pytest.mark.parametrize("kwarg_name", ["read_only", "data_only"]) +@pytest.mark.parametrize("kwarg_value", [True, False]) +def test_engine_kwargs_append_reader(datapath, ext, kwarg_name, kwarg_value): + # GH 55027 + # test that `read_only` and `data_only` can be passed to + # `openpyxl.reader.excel.load_workbook` via `engine_kwargs` + filename = datapath("io", "data", "excel", "test1" + ext) + with contextlib.closing( + OpenpyxlReader(filename, engine_kwargs={kwarg_name: kwarg_value}) + ) as reader: + assert getattr(reader.book, kwarg_name) == kwarg_value + + +@pytest.mark.parametrize( + "mode,expected", [("w", ["baz"]), ("a", ["foo", "bar", "baz"])] +) +def test_write_append_mode(ext, mode, expected): + df = DataFrame([1], columns=["baz"]) + + with tm.ensure_clean(ext) as f: + wb = openpyxl.Workbook() + wb.worksheets[0].title = "foo" + wb.worksheets[0]["A1"].value = "foo" + wb.create_sheet("bar") + wb.worksheets[1]["A1"].value = "bar" + wb.save(f) + + with ExcelWriter(f, engine="openpyxl", mode=mode) as writer: + df.to_excel(writer, sheet_name="baz", index=False) + + with contextlib.closing(openpyxl.load_workbook(f)) as wb2: + result = [sheet.title for sheet in wb2.worksheets] + assert result == expected + + for index, cell_value in enumerate(expected): + assert wb2.worksheets[index]["A1"].value == cell_value + + +@pytest.mark.parametrize( + "if_sheet_exists,num_sheets,expected", + [ + ("new", 2, ["apple", "banana"]), + ("replace", 1, ["pear"]), + ("overlay", 1, ["pear", "banana"]), + ], +) +def test_if_sheet_exists_append_modes(ext, if_sheet_exists, num_sheets, expected): + # GH 40230 + df1 = DataFrame({"fruit": ["apple", "banana"]}) + df2 = DataFrame({"fruit": ["pear"]}) + + with tm.ensure_clean(ext) as f: + df1.to_excel(f, engine="openpyxl", sheet_name="foo", index=False) + with ExcelWriter( + f, engine="openpyxl", mode="a", if_sheet_exists=if_sheet_exists + ) as writer: + df2.to_excel(writer, sheet_name="foo", index=False) + + with contextlib.closing(openpyxl.load_workbook(f)) as wb: + assert len(wb.sheetnames) == num_sheets + assert wb.sheetnames[0] == "foo" + result = pd.read_excel(wb, "foo", engine="openpyxl") + assert list(result["fruit"]) == expected + if len(wb.sheetnames) == 2: + result = pd.read_excel(wb, wb.sheetnames[1], engine="openpyxl") + tm.assert_frame_equal(result, df2) + + +@pytest.mark.parametrize( + "startrow, startcol, greeting, goodbye", + [ + (0, 0, ["poop", "world"], ["goodbye", "people"]), + (0, 1, ["hello", "world"], ["poop", "people"]), + (1, 0, ["hello", "poop"], ["goodbye", "people"]), + (1, 1, ["hello", "world"], ["goodbye", "poop"]), + ], +) +def test_append_overlay_startrow_startcol(ext, startrow, startcol, greeting, goodbye): + df1 = DataFrame({"greeting": ["hello", "world"], "goodbye": ["goodbye", "people"]}) + df2 = DataFrame(["poop"]) + + with tm.ensure_clean(ext) as f: + df1.to_excel(f, engine="openpyxl", sheet_name="poo", index=False) + with ExcelWriter( + f, engine="openpyxl", mode="a", if_sheet_exists="overlay" + ) as writer: + # use startrow+1 because we don't have a header + df2.to_excel( + writer, + index=False, + header=False, + startrow=startrow + 1, + startcol=startcol, + sheet_name="poo", + ) + + result = pd.read_excel(f, sheet_name="poo", engine="openpyxl") + expected = DataFrame({"greeting": greeting, "goodbye": goodbye}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "if_sheet_exists,msg", + [ + ( + "invalid", + "'invalid' is not valid for if_sheet_exists. Valid options " + "are 'error', 'new', 'replace' and 'overlay'.", + ), + ( + "error", + "Sheet 'foo' already exists and if_sheet_exists is set to 'error'.", + ), + ( + None, + "Sheet 'foo' already exists and if_sheet_exists is set to 'error'.", + ), + ], +) +def test_if_sheet_exists_raises(ext, if_sheet_exists, msg): + # GH 40230 + df = DataFrame({"fruit": ["pear"]}) + with tm.ensure_clean(ext) as f: + with pytest.raises(ValueError, match=re.escape(msg)): + df.to_excel(f, sheet_name="foo", engine="openpyxl") + with ExcelWriter( + f, engine="openpyxl", mode="a", if_sheet_exists=if_sheet_exists + ) as writer: + df.to_excel(writer, sheet_name="foo") + + +def test_to_excel_with_openpyxl_engine(ext): + # GH 29854 + with tm.ensure_clean(ext) as filename: + df1 = DataFrame({"A": np.linspace(1, 10, 10)}) + df2 = DataFrame({"B": np.linspace(1, 20, 10)}) + df = pd.concat([df1, df2], axis=1) + styled = df.style.map( + lambda val: f"color: {'red' if val < 0 else 'black'}" + ).highlight_max() + + styled.to_excel(filename, engine="openpyxl") + + +@pytest.mark.parametrize("read_only", [True, False]) +def test_read_workbook(datapath, ext, read_only): + # GH 39528 + filename = datapath("io", "data", "excel", "test1" + ext) + with contextlib.closing( + openpyxl.load_workbook(filename, read_only=read_only) + ) as wb: + result = pd.read_excel(wb, engine="openpyxl") + expected = pd.read_excel(filename) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "header, expected_data", + [ + ( + 0, + { + "Title": [np.nan, "A", 1, 2, 3], + "Unnamed: 1": [np.nan, "B", 4, 5, 6], + "Unnamed: 2": [np.nan, "C", 7, 8, 9], + }, + ), + (2, {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}), + ], +) +@pytest.mark.parametrize( + "filename", ["dimension_missing", "dimension_small", "dimension_large"] +) +# When read_only is None, use read_excel instead of a workbook +@pytest.mark.parametrize("read_only", [True, False, None]) +def test_read_with_bad_dimension( + datapath, ext, header, expected_data, filename, read_only +): + # GH 38956, 39001 - no/incorrect dimension information + path = datapath("io", "data", "excel", f"{filename}{ext}") + if read_only is None: + result = pd.read_excel(path, header=header) + else: + with contextlib.closing( + openpyxl.load_workbook(path, read_only=read_only) + ) as wb: + result = pd.read_excel(wb, engine="openpyxl", header=header) + expected = DataFrame(expected_data) + tm.assert_frame_equal(result, expected) + + +def test_append_mode_file(ext): + # GH 39576 + df = DataFrame() + + with tm.ensure_clean(ext) as f: + df.to_excel(f, engine="openpyxl") + + with ExcelWriter( + f, mode="a", engine="openpyxl", if_sheet_exists="new" + ) as writer: + df.to_excel(writer) + + # make sure that zip files are not concatenated by making sure that + # "docProps/app.xml" only occurs twice in the file + data = Path(f).read_bytes() + first = data.find(b"docProps/app.xml") + second = data.find(b"docProps/app.xml", first + 1) + third = data.find(b"docProps/app.xml", second + 1) + assert second != -1 and third == -1 + + +# When read_only is None, use read_excel instead of a workbook +@pytest.mark.parametrize("read_only", [True, False, None]) +def test_read_with_empty_trailing_rows(datapath, ext, read_only): + # GH 39181 + path = datapath("io", "data", "excel", f"empty_trailing_rows{ext}") + if read_only is None: + result = pd.read_excel(path) + else: + with contextlib.closing( + openpyxl.load_workbook(path, read_only=read_only) + ) as wb: + result = pd.read_excel(wb, engine="openpyxl") + expected = DataFrame( + { + "Title": [np.nan, "A", 1, 2, 3], + "Unnamed: 1": [np.nan, "B", 4, 5, 6], + "Unnamed: 2": [np.nan, "C", 7, 8, 9], + } + ) + tm.assert_frame_equal(result, expected) + + +# When read_only is None, use read_excel instead of a workbook +@pytest.mark.parametrize("read_only", [True, False, None]) +def test_read_empty_with_blank_row(datapath, ext, read_only): + # GH 39547 - empty excel file with a row that has no data + path = datapath("io", "data", "excel", f"empty_with_blank_row{ext}") + if read_only is None: + result = pd.read_excel(path) + else: + with contextlib.closing( + openpyxl.load_workbook(path, read_only=read_only) + ) as wb: + result = pd.read_excel(wb, engine="openpyxl") + expected = DataFrame() + tm.assert_frame_equal(result, expected) + + +def test_book_and_sheets_consistent(ext): + # GH#45687 - Ensure sheets is updated if user modifies book + with tm.ensure_clean(ext) as f: + with ExcelWriter(f, engine="openpyxl") as writer: + assert writer.sheets == {} + sheet = writer.book.create_sheet("test_name", 0) + assert writer.sheets == {"test_name": sheet} + + +def test_ints_spelled_with_decimals(datapath, ext): + # GH 46988 - openpyxl returns this sheet with floats + path = datapath("io", "data", "excel", f"ints_spelled_with_decimals{ext}") + result = pd.read_excel(path) + expected = DataFrame(range(2, 12), columns=[1]) + tm.assert_frame_equal(result, expected) + + +def test_read_multiindex_header_no_index_names(datapath, ext): + # GH#47487 + path = datapath("io", "data", "excel", f"multiindex_no_index_names{ext}") + result = pd.read_excel(path, index_col=[0, 1, 2], header=[0, 1, 2]) + expected = DataFrame( + [[np.nan, "x", "x", "x"], ["x", np.nan, np.nan, np.nan]], + columns=pd.MultiIndex.from_tuples( + [("X", "Y", "A1"), ("X", "Y", "A2"), ("XX", "YY", "B1"), ("XX", "YY", "B2")] + ), + index=pd.MultiIndex.from_tuples([("A", "AA", "AAA"), ("A", "BB", "BBB")]), + ) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_readers.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_readers.py new file mode 100644 index 0000000000000000000000000000000000000000..8da8535952dcf98481716a2b00863dbfe6354af5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_readers.py @@ -0,0 +1,1751 @@ +from __future__ import annotations + +from datetime import ( + datetime, + time, +) +from functools import partial +from io import BytesIO +import os +from pathlib import Path +import platform +import re +from urllib.error import URLError +from zipfile import BadZipFile + +import numpy as np +import pytest + +from pandas._config import using_pyarrow_string_dtype + +from pandas.compat import is_platform_windows +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, + read_csv, +) +import pandas._testing as tm +from pandas.core.arrays import ( + ArrowStringArray, + StringArray, +) + +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + +read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"] +engine_params = [ + # Add any engines to test here + # When defusedxml is installed it triggers deprecation warnings for + # xlrd and openpyxl, so catch those here + pytest.param( + "xlrd", + marks=[ + td.skip_if_no("xlrd"), + ], + ), + pytest.param( + "openpyxl", + marks=[ + td.skip_if_no("openpyxl"), + ], + ), + pytest.param( + None, + marks=[ + td.skip_if_no("xlrd"), + ], + ), + pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")), + pytest.param("odf", marks=td.skip_if_no("odf")), + pytest.param("calamine", marks=td.skip_if_no("python_calamine")), +] + + +def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool: + """ + Filter out invalid (engine, ext) pairs instead of skipping, as that + produces 500+ pytest.skips. + """ + engine = engine.values[0] + if engine == "openpyxl" and read_ext == ".xls": + return False + if engine == "odf" and read_ext != ".ods": + return False + if read_ext == ".ods" and engine not in {"odf", "calamine"}: + return False + if engine == "pyxlsb" and read_ext != ".xlsb": + return False + if read_ext == ".xlsb" and engine not in {"pyxlsb", "calamine"}: + return False + if engine == "xlrd" and read_ext != ".xls": + return False + return True + + +def _transfer_marks(engine, read_ext): + """ + engine gives us a pytest.param object with some marks, read_ext is just + a string. We need to generate a new pytest.param inheriting the marks. + """ + values = engine.values + (read_ext,) + new_param = pytest.param(values, marks=engine.marks) + return new_param + + +@pytest.fixture( + params=[ + _transfer_marks(eng, ext) + for eng in engine_params + for ext in read_ext_params + if _is_valid_engine_ext_pair(eng, ext) + ], + ids=str, +) +def engine_and_read_ext(request): + """ + Fixture for Excel reader engine and read_ext, only including valid pairs. + """ + return request.param + + +@pytest.fixture +def engine(engine_and_read_ext): + engine, read_ext = engine_and_read_ext + return engine + + +@pytest.fixture +def read_ext(engine_and_read_ext): + engine, read_ext = engine_and_read_ext + return read_ext + + +@pytest.fixture +def df_ref(datapath): + """ + Obtain the reference data from read_csv with the Python engine. + """ + filepath = datapath("io", "data", "csv", "test1.csv") + df_ref = read_csv(filepath, index_col=0, parse_dates=True, engine="python") + return df_ref + + +def get_exp_unit(read_ext: str, engine: str | None) -> str: + return "ns" + + +def adjust_expected(expected: DataFrame, read_ext: str, engine: str) -> None: + expected.index.name = None + unit = get_exp_unit(read_ext, engine) + # error: "Index" has no attribute "as_unit" + expected.index = expected.index.as_unit(unit) # type: ignore[attr-defined] + + +def xfail_datetimes_with_pyxlsb(engine, request): + if engine == "pyxlsb": + request.applymarker( + pytest.mark.xfail( + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + + +class TestReaders: + @pytest.fixture(autouse=True) + def cd_and_set_engine(self, engine, datapath, monkeypatch): + """ + Change directory and set engine for read_excel calls. + """ + func = partial(pd.read_excel, engine=engine) + monkeypatch.chdir(datapath("io", "data", "excel")) + monkeypatch.setattr(pd, "read_excel", func) + + def test_engine_used(self, read_ext, engine, monkeypatch): + # GH 38884 + def parser(self, *args, **kwargs): + return self.engine + + monkeypatch.setattr(pd.ExcelFile, "parse", parser) + + expected_defaults = { + "xlsx": "openpyxl", + "xlsm": "openpyxl", + "xlsb": "pyxlsb", + "xls": "xlrd", + "ods": "odf", + } + + with open("test1" + read_ext, "rb") as f: + result = pd.read_excel(f) + + if engine is not None: + expected = engine + else: + expected = expected_defaults[read_ext[1:]] + assert result == expected + + def test_engine_kwargs(self, read_ext, engine): + # GH#52214 + expected_defaults = { + "xlsx": {"foo": "abcd"}, + "xlsm": {"foo": 123}, + "xlsb": {"foo": "True"}, + "xls": {"foo": True}, + "ods": {"foo": "abcd"}, + } + + if engine in {"xlrd", "pyxlsb"}: + msg = re.escape(r"open_workbook() got an unexpected keyword argument 'foo'") + elif engine == "odf": + msg = re.escape(r"load() got an unexpected keyword argument 'foo'") + else: + msg = re.escape(r"load_workbook() got an unexpected keyword argument 'foo'") + + if engine is not None: + with pytest.raises(TypeError, match=msg): + pd.read_excel( + "test1" + read_ext, + sheet_name="Sheet1", + index_col=0, + engine_kwargs=expected_defaults[read_ext[1:]], + ) + + def test_usecols_int(self, read_ext): + # usecols as int + msg = "Passing an integer for `usecols`" + with pytest.raises(ValueError, match=msg): + pd.read_excel( + "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=3 + ) + + # usecols as int + with pytest.raises(ValueError, match=msg): + pd.read_excel( + "test1" + read_ext, + sheet_name="Sheet2", + skiprows=[1], + index_col=0, + usecols=3, + ) + + def test_usecols_list(self, request, engine, read_ext, df_ref): + xfail_datetimes_with_pyxlsb(engine, request) + + expected = df_ref[["B", "C"]] + adjust_expected(expected, read_ext, engine) + + df1 = pd.read_excel( + "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=[0, 2, 3] + ) + df2 = pd.read_excel( + "test1" + read_ext, + sheet_name="Sheet2", + skiprows=[1], + index_col=0, + usecols=[0, 2, 3], + ) + + # TODO add index to xls file) + tm.assert_frame_equal(df1, expected) + tm.assert_frame_equal(df2, expected) + + def test_usecols_str(self, request, engine, read_ext, df_ref): + xfail_datetimes_with_pyxlsb(engine, request) + + expected = df_ref[["A", "B", "C"]] + adjust_expected(expected, read_ext, engine) + + df2 = pd.read_excel( + "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A:D" + ) + df3 = pd.read_excel( + "test1" + read_ext, + sheet_name="Sheet2", + skiprows=[1], + index_col=0, + usecols="A:D", + ) + + # TODO add index to xls, read xls ignores index name ? + tm.assert_frame_equal(df2, expected) + tm.assert_frame_equal(df3, expected) + + expected = df_ref[["B", "C"]] + adjust_expected(expected, read_ext, engine) + + df2 = pd.read_excel( + "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,C,D" + ) + df3 = pd.read_excel( + "test1" + read_ext, + sheet_name="Sheet2", + skiprows=[1], + index_col=0, + usecols="A,C,D", + ) + # TODO add index to xls file + tm.assert_frame_equal(df2, expected) + tm.assert_frame_equal(df3, expected) + + df2 = pd.read_excel( + "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,C:D" + ) + df3 = pd.read_excel( + "test1" + read_ext, + sheet_name="Sheet2", + skiprows=[1], + index_col=0, + usecols="A,C:D", + ) + tm.assert_frame_equal(df2, expected) + tm.assert_frame_equal(df3, expected) + + @pytest.mark.parametrize( + "usecols", [[0, 1, 3], [0, 3, 1], [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0]] + ) + def test_usecols_diff_positional_int_columns_order( + self, request, engine, read_ext, usecols, df_ref + ): + xfail_datetimes_with_pyxlsb(engine, request) + + expected = df_ref[["A", "C"]] + adjust_expected(expected, read_ext, engine) + + result = pd.read_excel( + "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=usecols + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("usecols", [["B", "D"], ["D", "B"]]) + def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_ref): + expected = df_ref[["B", "D"]] + expected.index = range(len(expected)) + + result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", usecols=usecols) + tm.assert_frame_equal(result, expected) + + def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref): + xfail_datetimes_with_pyxlsb(engine, request) + + expected = df_ref + adjust_expected(expected, read_ext, engine) + + result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0) + tm.assert_frame_equal(result, expected) + + def test_usecols_excel_range_str(self, request, engine, read_ext, df_ref): + xfail_datetimes_with_pyxlsb(engine, request) + + expected = df_ref[["C", "D"]] + adjust_expected(expected, read_ext, engine) + + result = pd.read_excel( + "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,D:E" + ) + tm.assert_frame_equal(result, expected) + + def test_usecols_excel_range_str_invalid(self, read_ext): + msg = "Invalid column name: E1" + + with pytest.raises(ValueError, match=msg): + pd.read_excel("test1" + read_ext, sheet_name="Sheet1", usecols="D:E1") + + def test_index_col_label_error(self, read_ext): + msg = "list indices must be integers.*, not str" + + with pytest.raises(TypeError, match=msg): + pd.read_excel( + "test1" + read_ext, + sheet_name="Sheet1", + index_col=["A"], + usecols=["A", "C"], + ) + + def test_index_col_str(self, read_ext): + # see gh-52716 + result = pd.read_excel("test1" + read_ext, sheet_name="Sheet3", index_col="A") + expected = DataFrame( + columns=["B", "C", "D", "E", "F"], index=Index([], name="A") + ) + tm.assert_frame_equal(result, expected) + + def test_index_col_empty(self, read_ext): + # see gh-9208 + result = pd.read_excel( + "test1" + read_ext, sheet_name="Sheet3", index_col=["A", "B", "C"] + ) + expected = DataFrame( + columns=["D", "E", "F"], + index=MultiIndex(levels=[[]] * 3, codes=[[]] * 3, names=["A", "B", "C"]), + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("index_col", [None, 2]) + def test_index_col_with_unnamed(self, read_ext, index_col): + # see gh-18792 + result = pd.read_excel( + "test1" + read_ext, sheet_name="Sheet4", index_col=index_col + ) + expected = DataFrame( + [["i1", "a", "x"], ["i2", "b", "y"]], columns=["Unnamed: 0", "col1", "col2"] + ) + if index_col: + expected = expected.set_index(expected.columns[index_col]) + + tm.assert_frame_equal(result, expected) + + def test_usecols_pass_non_existent_column(self, read_ext): + msg = ( + "Usecols do not match columns, " + "columns expected but not found: " + r"\['E'\]" + ) + + with pytest.raises(ValueError, match=msg): + pd.read_excel("test1" + read_ext, usecols=["E"]) + + def test_usecols_wrong_type(self, read_ext): + msg = ( + "'usecols' must either be list-like of " + "all strings, all unicode, all integers or a callable." + ) + + with pytest.raises(ValueError, match=msg): + pd.read_excel("test1" + read_ext, usecols=["E1", 0]) + + def test_excel_stop_iterator(self, read_ext): + parsed = pd.read_excel("test2" + read_ext, sheet_name="Sheet1") + expected = DataFrame([["aaaa", "bbbbb"]], columns=["Test", "Test1"]) + tm.assert_frame_equal(parsed, expected) + + def test_excel_cell_error_na(self, request, engine, read_ext): + xfail_datetimes_with_pyxlsb(engine, request) + + # https://github.com/tafia/calamine/issues/355 + if engine == "calamine" and read_ext == ".ods": + request.applymarker( + pytest.mark.xfail(reason="Calamine can't extract error from ods files") + ) + + parsed = pd.read_excel("test3" + read_ext, sheet_name="Sheet1") + expected = DataFrame([[np.nan]], columns=["Test"]) + tm.assert_frame_equal(parsed, expected) + + def test_excel_table(self, request, engine, read_ext, df_ref): + xfail_datetimes_with_pyxlsb(engine, request) + + expected = df_ref + adjust_expected(expected, read_ext, engine) + + df1 = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0) + df2 = pd.read_excel( + "test1" + read_ext, sheet_name="Sheet2", skiprows=[1], index_col=0 + ) + # TODO add index to file + tm.assert_frame_equal(df1, expected) + tm.assert_frame_equal(df2, expected) + + df3 = pd.read_excel( + "test1" + read_ext, sheet_name="Sheet1", index_col=0, skipfooter=1 + ) + tm.assert_frame_equal(df3, df1.iloc[:-1]) + + def test_reader_special_dtypes(self, request, engine, read_ext): + xfail_datetimes_with_pyxlsb(engine, request) + + unit = get_exp_unit(read_ext, engine) + expected = DataFrame.from_dict( + { + "IntCol": [1, 2, -3, 4, 0], + "FloatCol": [1.25, 2.25, 1.83, 1.92, 0.0000000005], + "BoolCol": [True, False, True, True, False], + "StrCol": [1, 2, 3, 4, 5], + "Str2Col": ["a", 3, "c", "d", "e"], + "DateCol": Index( + [ + datetime(2013, 10, 30), + datetime(2013, 10, 31), + datetime(1905, 1, 1), + datetime(2013, 12, 14), + datetime(2015, 3, 14), + ], + dtype=f"M8[{unit}]", + ), + }, + ) + basename = "test_types" + + # should read in correctly and infer types + actual = pd.read_excel(basename + read_ext, sheet_name="Sheet1") + tm.assert_frame_equal(actual, expected) + + # if not coercing number, then int comes in as float + float_expected = expected.copy() + float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0 + actual = pd.read_excel(basename + read_ext, sheet_name="Sheet1") + tm.assert_frame_equal(actual, float_expected) + + # check setting Index (assuming xls and xlsx are the same here) + for icol, name in enumerate(expected.columns): + actual = pd.read_excel( + basename + read_ext, sheet_name="Sheet1", index_col=icol + ) + exp = expected.set_index(name) + tm.assert_frame_equal(actual, exp) + + expected["StrCol"] = expected["StrCol"].apply(str) + actual = pd.read_excel( + basename + read_ext, sheet_name="Sheet1", converters={"StrCol": str} + ) + tm.assert_frame_equal(actual, expected) + + # GH8212 - support for converters and missing values + def test_reader_converters(self, read_ext): + basename = "test_converters" + + expected = DataFrame.from_dict( + { + "IntCol": [1, 2, -3, -1000, 0], + "FloatCol": [12.5, np.nan, 18.3, 19.2, 0.000000005], + "BoolCol": ["Found", "Found", "Found", "Not found", "Found"], + "StrCol": ["1", np.nan, "3", "4", "5"], + } + ) + + converters = { + "IntCol": lambda x: int(x) if x != "" else -1000, + "FloatCol": lambda x: 10 * x if x else np.nan, + 2: lambda x: "Found" if x != "" else "Not found", + 3: lambda x: str(x) if x else "", + } + + # should read in correctly and set types of single cells (not array + # dtypes) + actual = pd.read_excel( + basename + read_ext, sheet_name="Sheet1", converters=converters + ) + tm.assert_frame_equal(actual, expected) + + def test_reader_dtype(self, read_ext): + # GH 8212 + basename = "testdtype" + actual = pd.read_excel(basename + read_ext) + + expected = DataFrame( + { + "a": [1, 2, 3, 4], + "b": [2.5, 3.5, 4.5, 5.5], + "c": [1, 2, 3, 4], + "d": [1.0, 2.0, np.nan, 4.0], + } + ) + + tm.assert_frame_equal(actual, expected) + + actual = pd.read_excel( + basename + read_ext, dtype={"a": "float64", "b": "float32", "c": str} + ) + + expected["a"] = expected["a"].astype("float64") + expected["b"] = expected["b"].astype("float32") + expected["c"] = Series(["001", "002", "003", "004"], dtype=object) + tm.assert_frame_equal(actual, expected) + + msg = "Unable to convert column d to type int64" + with pytest.raises(ValueError, match=msg): + pd.read_excel(basename + read_ext, dtype={"d": "int64"}) + + @pytest.mark.parametrize( + "dtype,expected", + [ + ( + None, + DataFrame( + { + "a": [1, 2, 3, 4], + "b": [2.5, 3.5, 4.5, 5.5], + "c": [1, 2, 3, 4], + "d": [1.0, 2.0, np.nan, 4.0], + } + ), + ), + ( + {"a": "float64", "b": "float32", "c": str, "d": str}, + DataFrame( + { + "a": Series([1, 2, 3, 4], dtype="float64"), + "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"), + "c": Series(["001", "002", "003", "004"], dtype=object), + "d": Series(["1", "2", np.nan, "4"], dtype=object), + } + ), + ), + ], + ) + def test_reader_dtype_str(self, read_ext, dtype, expected): + # see gh-20377 + basename = "testdtype" + + actual = pd.read_excel(basename + read_ext, dtype=dtype) + tm.assert_frame_equal(actual, expected) + + def test_dtype_backend(self, read_ext, dtype_backend, engine): + # GH#36712 + if read_ext in (".xlsb", ".xls"): + pytest.skip(f"No engine for filetype: '{read_ext}'") + + df = DataFrame( + { + "a": Series([1, 3], dtype="Int64"), + "b": Series([2.5, 4.5], dtype="Float64"), + "c": Series([True, False], dtype="boolean"), + "d": Series(["a", "b"], dtype="string"), + "e": Series([pd.NA, 6], dtype="Int64"), + "f": Series([pd.NA, 7.5], dtype="Float64"), + "g": Series([pd.NA, True], dtype="boolean"), + "h": Series([pd.NA, "a"], dtype="string"), + "i": Series([pd.Timestamp("2019-12-31")] * 2), + "j": Series([pd.NA, pd.NA], dtype="Int64"), + } + ) + with tm.ensure_clean(read_ext) as file_path: + df.to_excel(file_path, sheet_name="test", index=False) + result = pd.read_excel( + file_path, sheet_name="test", dtype_backend=dtype_backend + ) + if dtype_backend == "pyarrow": + import pyarrow as pa + + from pandas.arrays import ArrowExtensionArray + + expected = DataFrame( + { + col: ArrowExtensionArray(pa.array(df[col], from_pandas=True)) + for col in df.columns + } + ) + # pyarrow by default infers timestamp resolution as us, not ns + expected["i"] = ArrowExtensionArray( + expected["i"].array._pa_array.cast(pa.timestamp(unit="us")) + ) + # pyarrow supports a null type, so don't have to default to Int64 + expected["j"] = ArrowExtensionArray(pa.array([None, None])) + else: + expected = df + unit = get_exp_unit(read_ext, engine) + expected["i"] = expected["i"].astype(f"M8[{unit}]") + + tm.assert_frame_equal(result, expected) + + def test_dtype_backend_and_dtype(self, read_ext): + # GH#36712 + if read_ext in (".xlsb", ".xls"): + pytest.skip(f"No engine for filetype: '{read_ext}'") + + df = DataFrame({"a": [np.nan, 1.0], "b": [2.5, np.nan]}) + with tm.ensure_clean(read_ext) as file_path: + df.to_excel(file_path, sheet_name="test", index=False) + result = pd.read_excel( + file_path, + sheet_name="test", + dtype_backend="numpy_nullable", + dtype="float64", + ) + tm.assert_frame_equal(result, df) + + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="infer_string takes precedence" + ) + def test_dtype_backend_string(self, read_ext, string_storage): + # GH#36712 + if read_ext in (".xlsb", ".xls"): + pytest.skip(f"No engine for filetype: '{read_ext}'") + + pa = pytest.importorskip("pyarrow") + + with pd.option_context("mode.string_storage", string_storage): + df = DataFrame( + { + "a": np.array(["a", "b"], dtype=np.object_), + "b": np.array(["x", pd.NA], dtype=np.object_), + } + ) + with tm.ensure_clean(read_ext) as file_path: + df.to_excel(file_path, sheet_name="test", index=False) + result = pd.read_excel( + file_path, sheet_name="test", dtype_backend="numpy_nullable" + ) + + if string_storage == "python": + expected = DataFrame( + { + "a": StringArray(np.array(["a", "b"], dtype=np.object_)), + "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)), + } + ) + else: + expected = DataFrame( + { + "a": ArrowStringArray(pa.array(["a", "b"])), + "b": ArrowStringArray(pa.array(["x", None])), + } + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtypes, exp_value", [({}, 1), ({"a.1": "int64"}, 1)]) + def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value): + # GH#35211 + basename = "df_mangle_dup_col_dtypes" + dtype_dict = {"a": object, **dtypes} + dtype_dict_copy = dtype_dict.copy() + # GH#42462 + result = pd.read_excel(basename + read_ext, dtype=dtype_dict) + expected = DataFrame( + { + "a": Series([1], dtype=object), + "a.1": Series([exp_value], dtype=object if not dtypes else None), + } + ) + assert dtype_dict == dtype_dict_copy, "dtype dict changed" + tm.assert_frame_equal(result, expected) + + def test_reader_spaces(self, read_ext): + # see gh-32207 + basename = "test_spaces" + + actual = pd.read_excel(basename + read_ext) + expected = DataFrame( + { + "testcol": [ + "this is great", + "4 spaces", + "1 trailing ", + " 1 leading", + "2 spaces multiple times", + ] + } + ) + tm.assert_frame_equal(actual, expected) + + # gh-36122, gh-35802 + @pytest.mark.parametrize( + "basename,expected", + [ + ("gh-35802", DataFrame({"COLUMN": ["Test (1)"]})), + ("gh-36122", DataFrame(columns=["got 2nd sa"])), + ], + ) + def test_read_excel_ods_nested_xml(self, engine, read_ext, basename, expected): + # see gh-35802 + if engine != "odf": + pytest.skip(f"Skipped for engine: {engine}") + + actual = pd.read_excel(basename + read_ext) + tm.assert_frame_equal(actual, expected) + + def test_reading_all_sheets(self, read_ext): + # Test reading all sheet names by setting sheet_name to None, + # Ensure a dict is returned. + # See PR #9450 + basename = "test_multisheet" + dfs = pd.read_excel(basename + read_ext, sheet_name=None) + # ensure this is not alphabetical to test order preservation + expected_keys = ["Charlie", "Alpha", "Beta"] + tm.assert_contains_all(expected_keys, dfs.keys()) + # Issue 9930 + # Ensure sheet order is preserved + assert expected_keys == list(dfs.keys()) + + def test_reading_multiple_specific_sheets(self, read_ext): + # Test reading specific sheet names by specifying a mixed list + # of integers and strings, and confirm that duplicated sheet + # references (positions/names) are removed properly. + # Ensure a dict is returned + # See PR #9450 + basename = "test_multisheet" + # Explicitly request duplicates. Only the set should be returned. + expected_keys = [2, "Charlie", "Charlie"] + dfs = pd.read_excel(basename + read_ext, sheet_name=expected_keys) + expected_keys = list(set(expected_keys)) + tm.assert_contains_all(expected_keys, dfs.keys()) + assert len(expected_keys) == len(dfs.keys()) + + def test_reading_all_sheets_with_blank(self, read_ext): + # Test reading all sheet names by setting sheet_name to None, + # In the case where some sheets are blank. + # Issue #11711 + basename = "blank_with_header" + dfs = pd.read_excel(basename + read_ext, sheet_name=None) + expected_keys = ["Sheet1", "Sheet2", "Sheet3"] + tm.assert_contains_all(expected_keys, dfs.keys()) + + # GH6403 + def test_read_excel_blank(self, read_ext): + actual = pd.read_excel("blank" + read_ext, sheet_name="Sheet1") + tm.assert_frame_equal(actual, DataFrame()) + + def test_read_excel_blank_with_header(self, read_ext): + expected = DataFrame(columns=["col_1", "col_2"]) + actual = pd.read_excel("blank_with_header" + read_ext, sheet_name="Sheet1") + tm.assert_frame_equal(actual, expected) + + def test_exception_message_includes_sheet_name(self, read_ext): + # GH 48706 + with pytest.raises(ValueError, match=r" \(sheet: Sheet1\)$"): + pd.read_excel("blank_with_header" + read_ext, header=[1], sheet_name=None) + with pytest.raises(ZeroDivisionError, match=r" \(sheet: Sheet1\)$"): + pd.read_excel("test1" + read_ext, usecols=lambda x: 1 / 0, sheet_name=None) + + @pytest.mark.filterwarnings("ignore:Cell A4 is marked:UserWarning:openpyxl") + def test_date_conversion_overflow(self, request, engine, read_ext): + # GH 10001 : pandas.ExcelFile ignore parse_dates=False + xfail_datetimes_with_pyxlsb(engine, request) + + expected = DataFrame( + [ + [pd.Timestamp("2016-03-12"), "Marc Johnson"], + [pd.Timestamp("2016-03-16"), "Jack Black"], + [1e20, "Timothy Brown"], + ], + columns=["DateColWithBigInt", "StringCol"], + ) + + if engine == "openpyxl": + request.applymarker( + pytest.mark.xfail(reason="Maybe not supported by openpyxl") + ) + + if engine is None and read_ext in (".xlsx", ".xlsm"): + # GH 35029 + request.applymarker( + pytest.mark.xfail(reason="Defaults to openpyxl, maybe not supported") + ) + + result = pd.read_excel("testdateoverflow" + read_ext) + tm.assert_frame_equal(result, expected) + + def test_sheet_name(self, request, read_ext, engine, df_ref): + xfail_datetimes_with_pyxlsb(engine, request) + + filename = "test1" + sheet_name = "Sheet1" + + expected = df_ref + adjust_expected(expected, read_ext, engine) + + df1 = pd.read_excel( + filename + read_ext, sheet_name=sheet_name, index_col=0 + ) # doc + df2 = pd.read_excel(filename + read_ext, index_col=0, sheet_name=sheet_name) + + tm.assert_frame_equal(df1, expected) + tm.assert_frame_equal(df2, expected) + + def test_excel_read_buffer(self, read_ext): + pth = "test1" + read_ext + expected = pd.read_excel(pth, sheet_name="Sheet1", index_col=0) + with open(pth, "rb") as f: + actual = pd.read_excel(f, sheet_name="Sheet1", index_col=0) + tm.assert_frame_equal(expected, actual) + + def test_bad_engine_raises(self): + bad_engine = "foo" + with pytest.raises(ValueError, match="Unknown engine: foo"): + pd.read_excel("", engine=bad_engine) + + @pytest.mark.parametrize( + "sheet_name", + [3, [0, 3], [3, 0], "Sheet4", ["Sheet1", "Sheet4"], ["Sheet4", "Sheet1"]], + ) + def test_bad_sheetname_raises(self, read_ext, sheet_name): + # GH 39250 + msg = "Worksheet index 3 is invalid|Worksheet named 'Sheet4' not found" + with pytest.raises(ValueError, match=msg): + pd.read_excel("blank" + read_ext, sheet_name=sheet_name) + + def test_missing_file_raises(self, read_ext): + bad_file = f"foo{read_ext}" + # CI tests with other languages, translates to "No such file or directory" + match = "|".join( + [ + "(No such file or directory", + "没有那个文件或目录", + "File o directory non esistente)", + ] + ) + with pytest.raises(FileNotFoundError, match=match): + pd.read_excel(bad_file) + + def test_corrupt_bytes_raises(self, engine): + bad_stream = b"foo" + if engine is None: + error = ValueError + msg = ( + "Excel file format cannot be determined, you must " + "specify an engine manually." + ) + elif engine == "xlrd": + from xlrd import XLRDError + + error = XLRDError + msg = ( + "Unsupported format, or corrupt file: Expected BOF " + "record; found b'foo'" + ) + elif engine == "calamine": + from python_calamine import CalamineError + + error = CalamineError + msg = "Cannot detect file format" + else: + error = BadZipFile + msg = "File is not a zip file" + with pytest.raises(error, match=msg): + pd.read_excel(BytesIO(bad_stream)) + + @pytest.mark.network + @pytest.mark.single_cpu + def test_read_from_http_url(self, httpserver, read_ext): + with open("test1" + read_ext, "rb") as f: + httpserver.serve_content(content=f.read()) + url_table = pd.read_excel(httpserver.url) + local_table = pd.read_excel("test1" + read_ext) + tm.assert_frame_equal(url_table, local_table) + + @td.skip_if_not_us_locale + @pytest.mark.single_cpu + def test_read_from_s3_url(self, read_ext, s3_public_bucket, s3so): + # Bucket created in tests/io/conftest.py + with open("test1" + read_ext, "rb") as f: + s3_public_bucket.put_object(Key="test1" + read_ext, Body=f) + + url = f"s3://{s3_public_bucket.name}/test1" + read_ext + + url_table = pd.read_excel(url, storage_options=s3so) + local_table = pd.read_excel("test1" + read_ext) + tm.assert_frame_equal(url_table, local_table) + + @pytest.mark.single_cpu + def test_read_from_s3_object(self, read_ext, s3_public_bucket, s3so): + # GH 38788 + # Bucket created in tests/io/conftest.py + with open("test1" + read_ext, "rb") as f: + s3_public_bucket.put_object(Key="test1" + read_ext, Body=f) + + import s3fs + + s3 = s3fs.S3FileSystem(**s3so) + + with s3.open(f"s3://{s3_public_bucket.name}/test1" + read_ext) as f: + url_table = pd.read_excel(f) + + local_table = pd.read_excel("test1" + read_ext) + tm.assert_frame_equal(url_table, local_table) + + @pytest.mark.slow + def test_read_from_file_url(self, read_ext, datapath): + # FILE + localtable = os.path.join(datapath("io", "data", "excel"), "test1" + read_ext) + local_table = pd.read_excel(localtable) + + try: + url_table = pd.read_excel("file://localhost/" + localtable) + except URLError: + # fails on some systems + platform_info = " ".join(platform.uname()).strip() + pytest.skip(f"failing on {platform_info}") + + tm.assert_frame_equal(url_table, local_table) + + def test_read_from_pathlib_path(self, read_ext): + # GH12655 + str_path = "test1" + read_ext + expected = pd.read_excel(str_path, sheet_name="Sheet1", index_col=0) + + path_obj = Path("test1" + read_ext) + actual = pd.read_excel(path_obj, sheet_name="Sheet1", index_col=0) + + tm.assert_frame_equal(expected, actual) + + @td.skip_if_no("py.path") + def test_read_from_py_localpath(self, read_ext): + # GH12655 + from py.path import local as LocalPath + + str_path = os.path.join("test1" + read_ext) + expected = pd.read_excel(str_path, sheet_name="Sheet1", index_col=0) + + path_obj = LocalPath().join("test1" + read_ext) + actual = pd.read_excel(path_obj, sheet_name="Sheet1", index_col=0) + + tm.assert_frame_equal(expected, actual) + + def test_close_from_py_localpath(self, read_ext): + # GH31467 + str_path = os.path.join("test1" + read_ext) + with open(str_path, "rb") as f: + x = pd.read_excel(f, sheet_name="Sheet1", index_col=0) + del x + # should not throw an exception because the passed file was closed + f.read() + + def test_reader_seconds(self, request, engine, read_ext): + xfail_datetimes_with_pyxlsb(engine, request) + + # GH 55045 + if engine == "calamine" and read_ext == ".ods": + request.applymarker( + pytest.mark.xfail( + reason="ODS file contains bad datetime (seconds as text)" + ) + ) + + # Test reading times with and without milliseconds. GH5945. + expected = DataFrame.from_dict( + { + "Time": [ + time(1, 2, 3), + time(2, 45, 56, 100000), + time(4, 29, 49, 200000), + time(6, 13, 42, 300000), + time(7, 57, 35, 400000), + time(9, 41, 28, 500000), + time(11, 25, 21, 600000), + time(13, 9, 14, 700000), + time(14, 53, 7, 800000), + time(16, 37, 0, 900000), + time(18, 20, 54), + ] + } + ) + + actual = pd.read_excel("times_1900" + read_ext, sheet_name="Sheet1") + tm.assert_frame_equal(actual, expected) + + actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1") + tm.assert_frame_equal(actual, expected) + + def test_read_excel_multiindex(self, request, engine, read_ext): + # see gh-4679 + xfail_datetimes_with_pyxlsb(engine, request) + + unit = get_exp_unit(read_ext, engine) + + mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]]) + mi_file = "testmultiindex" + read_ext + + # "mi_column" sheet + expected = DataFrame( + [ + [1, 2.5, pd.Timestamp("2015-01-01"), True], + [2, 3.5, pd.Timestamp("2015-01-02"), False], + [3, 4.5, pd.Timestamp("2015-01-03"), False], + [4, 5.5, pd.Timestamp("2015-01-04"), True], + ], + columns=mi, + ) + expected[mi[2]] = expected[mi[2]].astype(f"M8[{unit}]") + + actual = pd.read_excel( + mi_file, sheet_name="mi_column", header=[0, 1], index_col=0 + ) + tm.assert_frame_equal(actual, expected) + + # "mi_index" sheet + expected.index = mi + expected.columns = ["a", "b", "c", "d"] + + actual = pd.read_excel(mi_file, sheet_name="mi_index", index_col=[0, 1]) + tm.assert_frame_equal(actual, expected) + + # "both" sheet + expected.columns = mi + + actual = pd.read_excel( + mi_file, sheet_name="both", index_col=[0, 1], header=[0, 1] + ) + tm.assert_frame_equal(actual, expected) + + # "mi_index_name" sheet + expected.columns = ["a", "b", "c", "d"] + expected.index = mi.set_names(["ilvl1", "ilvl2"]) + + actual = pd.read_excel(mi_file, sheet_name="mi_index_name", index_col=[0, 1]) + tm.assert_frame_equal(actual, expected) + + # "mi_column_name" sheet + expected.index = list(range(4)) + expected.columns = mi.set_names(["c1", "c2"]) + actual = pd.read_excel( + mi_file, sheet_name="mi_column_name", header=[0, 1], index_col=0 + ) + tm.assert_frame_equal(actual, expected) + + # see gh-11317 + # "name_with_int" sheet + expected.columns = mi.set_levels([1, 2], level=1).set_names(["c1", "c2"]) + + actual = pd.read_excel( + mi_file, sheet_name="name_with_int", index_col=0, header=[0, 1] + ) + tm.assert_frame_equal(actual, expected) + + # "both_name" sheet + expected.columns = mi.set_names(["c1", "c2"]) + expected.index = mi.set_names(["ilvl1", "ilvl2"]) + + actual = pd.read_excel( + mi_file, sheet_name="both_name", index_col=[0, 1], header=[0, 1] + ) + tm.assert_frame_equal(actual, expected) + + # "both_skiprows" sheet + actual = pd.read_excel( + mi_file, + sheet_name="both_name_skiprows", + index_col=[0, 1], + header=[0, 1], + skiprows=2, + ) + tm.assert_frame_equal(actual, expected) + + @pytest.mark.parametrize( + "sheet_name,idx_lvl2", + [ + ("both_name_blank_after_mi_name", [np.nan, "b", "a", "b"]), + ("both_name_multiple_blanks", [np.nan] * 4), + ], + ) + def test_read_excel_multiindex_blank_after_name( + self, request, engine, read_ext, sheet_name, idx_lvl2 + ): + # GH34673 + xfail_datetimes_with_pyxlsb(engine, request) + + mi_file = "testmultiindex" + read_ext + mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]], names=["c1", "c2"]) + + unit = get_exp_unit(read_ext, engine) + + expected = DataFrame( + [ + [1, 2.5, pd.Timestamp("2015-01-01"), True], + [2, 3.5, pd.Timestamp("2015-01-02"), False], + [3, 4.5, pd.Timestamp("2015-01-03"), False], + [4, 5.5, pd.Timestamp("2015-01-04"), True], + ], + columns=mi, + index=MultiIndex.from_arrays( + (["foo", "foo", "bar", "bar"], idx_lvl2), + names=["ilvl1", "ilvl2"], + ), + ) + expected[mi[2]] = expected[mi[2]].astype(f"M8[{unit}]") + result = pd.read_excel( + mi_file, + sheet_name=sheet_name, + index_col=[0, 1], + header=[0, 1], + ) + tm.assert_frame_equal(result, expected) + + def test_read_excel_multiindex_header_only(self, read_ext): + # see gh-11733. + # + # Don't try to parse a header name if there isn't one. + mi_file = "testmultiindex" + read_ext + result = pd.read_excel(mi_file, sheet_name="index_col_none", header=[0, 1]) + + exp_columns = MultiIndex.from_product([("A", "B"), ("key", "val")]) + expected = DataFrame([[1, 2, 3, 4]] * 2, columns=exp_columns) + tm.assert_frame_equal(result, expected) + + def test_excel_old_index_format(self, read_ext): + # see gh-4679 + filename = "test_index_name_pre17" + read_ext + + # We detect headers to determine if index names exist, so + # that "index" name in the "names" version of the data will + # now be interpreted as rows that include null data. + data = np.array( + [ + [np.nan, np.nan, np.nan, np.nan, np.nan], + ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], + ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], + ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], + ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], + ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"], + ], + dtype=object, + ) + columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] + mi = MultiIndex( + levels=[ + ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], + ["R1", "R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"], + ], + codes=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]], + names=[None, None], + ) + si = Index( + ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None + ) + + expected = DataFrame(data, index=si, columns=columns) + + actual = pd.read_excel(filename, sheet_name="single_names", index_col=0) + tm.assert_frame_equal(actual, expected) + + expected.index = mi + + actual = pd.read_excel(filename, sheet_name="multi_names", index_col=[0, 1]) + tm.assert_frame_equal(actual, expected) + + # The analogous versions of the "names" version data + # where there are explicitly no names for the indices. + data = np.array( + [ + ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], + ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], + ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], + ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], + ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"], + ] + ) + columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] + mi = MultiIndex( + levels=[ + ["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], + ["R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"], + ], + codes=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], + names=[None, None], + ) + si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None) + + expected = DataFrame(data, index=si, columns=columns) + + actual = pd.read_excel(filename, sheet_name="single_no_names", index_col=0) + tm.assert_frame_equal(actual, expected) + + expected.index = mi + + actual = pd.read_excel(filename, sheet_name="multi_no_names", index_col=[0, 1]) + tm.assert_frame_equal(actual, expected) + + def test_read_excel_bool_header_arg(self, read_ext): + # GH 6114 + msg = "Passing a bool to header is invalid" + for arg in [True, False]: + with pytest.raises(TypeError, match=msg): + pd.read_excel("test1" + read_ext, header=arg) + + def test_read_excel_skiprows(self, request, engine, read_ext): + # GH 4903 + xfail_datetimes_with_pyxlsb(engine, request) + + unit = get_exp_unit(read_ext, engine) + + actual = pd.read_excel( + "testskiprows" + read_ext, sheet_name="skiprows_list", skiprows=[0, 2] + ) + expected = DataFrame( + [ + [1, 2.5, pd.Timestamp("2015-01-01"), True], + [2, 3.5, pd.Timestamp("2015-01-02"), False], + [3, 4.5, pd.Timestamp("2015-01-03"), False], + [4, 5.5, pd.Timestamp("2015-01-04"), True], + ], + columns=["a", "b", "c", "d"], + ) + expected["c"] = expected["c"].astype(f"M8[{unit}]") + tm.assert_frame_equal(actual, expected) + + actual = pd.read_excel( + "testskiprows" + read_ext, + sheet_name="skiprows_list", + skiprows=np.array([0, 2]), + ) + tm.assert_frame_equal(actual, expected) + + # GH36435 + actual = pd.read_excel( + "testskiprows" + read_ext, + sheet_name="skiprows_list", + skiprows=lambda x: x in [0, 2], + ) + tm.assert_frame_equal(actual, expected) + + actual = pd.read_excel( + "testskiprows" + read_ext, + sheet_name="skiprows_list", + skiprows=3, + names=["a", "b", "c", "d"], + ) + expected = DataFrame( + [ + # [1, 2.5, pd.Timestamp("2015-01-01"), True], + [2, 3.5, pd.Timestamp("2015-01-02"), False], + [3, 4.5, pd.Timestamp("2015-01-03"), False], + [4, 5.5, pd.Timestamp("2015-01-04"), True], + ], + columns=["a", "b", "c", "d"], + ) + expected["c"] = expected["c"].astype(f"M8[{unit}]") + tm.assert_frame_equal(actual, expected) + + def test_read_excel_skiprows_callable_not_in(self, request, engine, read_ext): + # GH 4903 + xfail_datetimes_with_pyxlsb(engine, request) + unit = get_exp_unit(read_ext, engine) + + actual = pd.read_excel( + "testskiprows" + read_ext, + sheet_name="skiprows_list", + skiprows=lambda x: x not in [1, 3, 5], + ) + expected = DataFrame( + [ + [1, 2.5, pd.Timestamp("2015-01-01"), True], + # [2, 3.5, pd.Timestamp("2015-01-02"), False], + [3, 4.5, pd.Timestamp("2015-01-03"), False], + # [4, 5.5, pd.Timestamp("2015-01-04"), True], + ], + columns=["a", "b", "c", "d"], + ) + expected["c"] = expected["c"].astype(f"M8[{unit}]") + tm.assert_frame_equal(actual, expected) + + def test_read_excel_nrows(self, read_ext): + # GH 16645 + num_rows_to_pull = 5 + actual = pd.read_excel("test1" + read_ext, nrows=num_rows_to_pull) + expected = pd.read_excel("test1" + read_ext) + expected = expected[:num_rows_to_pull] + tm.assert_frame_equal(actual, expected) + + def test_read_excel_nrows_greater_than_nrows_in_file(self, read_ext): + # GH 16645 + expected = pd.read_excel("test1" + read_ext) + num_records_in_file = len(expected) + num_rows_to_pull = num_records_in_file + 10 + actual = pd.read_excel("test1" + read_ext, nrows=num_rows_to_pull) + tm.assert_frame_equal(actual, expected) + + def test_read_excel_nrows_non_integer_parameter(self, read_ext): + # GH 16645 + msg = "'nrows' must be an integer >=0" + with pytest.raises(ValueError, match=msg): + pd.read_excel("test1" + read_ext, nrows="5") + + @pytest.mark.parametrize( + "filename,sheet_name,header,index_col,skiprows", + [ + ("testmultiindex", "mi_column", [0, 1], 0, None), + ("testmultiindex", "mi_index", None, [0, 1], None), + ("testmultiindex", "both", [0, 1], [0, 1], None), + ("testmultiindex", "mi_column_name", [0, 1], 0, None), + ("testskiprows", "skiprows_list", None, None, [0, 2]), + ("testskiprows", "skiprows_list", None, None, lambda x: x in (0, 2)), + ], + ) + def test_read_excel_nrows_params( + self, read_ext, filename, sheet_name, header, index_col, skiprows + ): + """ + For various parameters, we should get the same result whether we + limit the rows during load (nrows=3) or after (df.iloc[:3]). + """ + # GH 46894 + expected = pd.read_excel( + filename + read_ext, + sheet_name=sheet_name, + header=header, + index_col=index_col, + skiprows=skiprows, + ).iloc[:3] + actual = pd.read_excel( + filename + read_ext, + sheet_name=sheet_name, + header=header, + index_col=index_col, + skiprows=skiprows, + nrows=3, + ) + tm.assert_frame_equal(actual, expected) + + def test_deprecated_kwargs(self, read_ext): + with pytest.raises(TypeError, match="but 3 positional arguments"): + pd.read_excel("test1" + read_ext, "Sheet1", 0) + + def test_no_header_with_list_index_col(self, read_ext): + # GH 31783 + file_name = "testmultiindex" + read_ext + data = [("B", "B"), ("key", "val"), (3, 4), (3, 4)] + idx = MultiIndex.from_tuples( + [("A", "A"), ("key", "val"), (1, 2), (1, 2)], names=(0, 1) + ) + expected = DataFrame(data, index=idx, columns=(2, 3)) + result = pd.read_excel( + file_name, sheet_name="index_col_none", index_col=[0, 1], header=None + ) + tm.assert_frame_equal(expected, result) + + def test_one_col_noskip_blank_line(self, read_ext): + # GH 39808 + file_name = "one_col_blank_line" + read_ext + data = [0.5, np.nan, 1, 2] + expected = DataFrame(data, columns=["numbers"]) + result = pd.read_excel(file_name) + tm.assert_frame_equal(result, expected) + + def test_multiheader_two_blank_lines(self, read_ext): + # GH 40442 + file_name = "testmultiindex" + read_ext + columns = MultiIndex.from_tuples([("a", "A"), ("b", "B")]) + data = [[np.nan, np.nan], [np.nan, np.nan], [1, 3], [2, 4]] + expected = DataFrame(data, columns=columns) + result = pd.read_excel( + file_name, sheet_name="mi_column_empty_rows", header=[0, 1] + ) + tm.assert_frame_equal(result, expected) + + def test_trailing_blanks(self, read_ext): + """ + Sheets can contain blank cells with no data. Some of our readers + were including those cells, creating many empty rows and columns + """ + file_name = "trailing_blanks" + read_ext + result = pd.read_excel(file_name) + assert result.shape == (3, 3) + + def test_ignore_chartsheets_by_str(self, request, engine, read_ext): + # GH 41448 + if read_ext == ".ods": + pytest.skip("chartsheets do not exist in the ODF format") + if engine == "pyxlsb": + request.applymarker( + pytest.mark.xfail( + reason="pyxlsb can't distinguish chartsheets from worksheets" + ) + ) + with pytest.raises(ValueError, match="Worksheet named 'Chart1' not found"): + pd.read_excel("chartsheet" + read_ext, sheet_name="Chart1") + + def test_ignore_chartsheets_by_int(self, request, engine, read_ext): + # GH 41448 + if read_ext == ".ods": + pytest.skip("chartsheets do not exist in the ODF format") + if engine == "pyxlsb": + request.applymarker( + pytest.mark.xfail( + reason="pyxlsb can't distinguish chartsheets from worksheets" + ) + ) + with pytest.raises( + ValueError, match="Worksheet index 1 is invalid, 1 worksheets found" + ): + pd.read_excel("chartsheet" + read_ext, sheet_name=1) + + def test_euro_decimal_format(self, read_ext): + # copied from read_csv + result = pd.read_excel("test_decimal" + read_ext, decimal=",", skiprows=1) + expected = DataFrame( + [ + [1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819], + [2, 121.12, 14897.76, "DEF", "uyt", 0.377320872], + [3, 878.158, 108013.434, "GHI", "rez", 2.735694704], + ], + columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"], + ) + tm.assert_frame_equal(result, expected) + + +class TestExcelFileRead: + def test_deprecate_bytes_input(self, engine, read_ext): + # GH 53830 + msg = ( + "Passing bytes to 'read_excel' is deprecated and " + "will be removed in a future version. To read from a " + "byte string, wrap it in a `BytesIO` object." + ) + + with tm.assert_produces_warning( + FutureWarning, match=msg, raise_on_extra_warnings=False + ): + with open("test1" + read_ext, "rb") as f: + pd.read_excel(f.read(), engine=engine) + + @pytest.fixture(autouse=True) + def cd_and_set_engine(self, engine, datapath, monkeypatch): + """ + Change directory and set engine for ExcelFile objects. + """ + func = partial(pd.ExcelFile, engine=engine) + monkeypatch.chdir(datapath("io", "data", "excel")) + monkeypatch.setattr(pd, "ExcelFile", func) + + def test_engine_used(self, read_ext, engine): + expected_defaults = { + "xlsx": "openpyxl", + "xlsm": "openpyxl", + "xlsb": "pyxlsb", + "xls": "xlrd", + "ods": "odf", + } + + with pd.ExcelFile("test1" + read_ext) as excel: + result = excel.engine + + if engine is not None: + expected = engine + else: + expected = expected_defaults[read_ext[1:]] + assert result == expected + + def test_excel_passes_na(self, read_ext): + with pd.ExcelFile("test4" + read_ext) as excel: + parsed = pd.read_excel( + excel, sheet_name="Sheet1", keep_default_na=False, na_values=["apple"] + ) + expected = DataFrame( + [["NA"], [1], ["NA"], [np.nan], ["rabbit"]], columns=["Test"] + ) + tm.assert_frame_equal(parsed, expected) + + with pd.ExcelFile("test4" + read_ext) as excel: + parsed = pd.read_excel( + excel, sheet_name="Sheet1", keep_default_na=True, na_values=["apple"] + ) + expected = DataFrame( + [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]], columns=["Test"] + ) + tm.assert_frame_equal(parsed, expected) + + # 13967 + with pd.ExcelFile("test5" + read_ext) as excel: + parsed = pd.read_excel( + excel, sheet_name="Sheet1", keep_default_na=False, na_values=["apple"] + ) + expected = DataFrame( + [["1.#QNAN"], [1], ["nan"], [np.nan], ["rabbit"]], columns=["Test"] + ) + tm.assert_frame_equal(parsed, expected) + + with pd.ExcelFile("test5" + read_ext) as excel: + parsed = pd.read_excel( + excel, sheet_name="Sheet1", keep_default_na=True, na_values=["apple"] + ) + expected = DataFrame( + [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]], columns=["Test"] + ) + tm.assert_frame_equal(parsed, expected) + + @pytest.mark.parametrize("na_filter", [None, True, False]) + def test_excel_passes_na_filter(self, read_ext, na_filter): + # gh-25453 + kwargs = {} + + if na_filter is not None: + kwargs["na_filter"] = na_filter + + with pd.ExcelFile("test5" + read_ext) as excel: + parsed = pd.read_excel( + excel, + sheet_name="Sheet1", + keep_default_na=True, + na_values=["apple"], + **kwargs, + ) + + if na_filter is False: + expected = [["1.#QNAN"], [1], ["nan"], ["apple"], ["rabbit"]] + else: + expected = [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]] + + expected = DataFrame(expected, columns=["Test"]) + tm.assert_frame_equal(parsed, expected) + + def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref): + xfail_datetimes_with_pyxlsb(engine, request) + + expected = df_ref + adjust_expected(expected, read_ext, engine) + + with pd.ExcelFile("test1" + read_ext) as excel: + df1 = pd.read_excel(excel, sheet_name=0, index_col=0) + df2 = pd.read_excel(excel, sheet_name=1, skiprows=[1], index_col=0) + tm.assert_frame_equal(df1, expected) + tm.assert_frame_equal(df2, expected) + + with pd.ExcelFile("test1" + read_ext) as excel: + df1 = excel.parse(0, index_col=0) + df2 = excel.parse(1, skiprows=[1], index_col=0) + tm.assert_frame_equal(df1, expected) + tm.assert_frame_equal(df2, expected) + + with pd.ExcelFile("test1" + read_ext) as excel: + df3 = pd.read_excel(excel, sheet_name=0, index_col=0, skipfooter=1) + tm.assert_frame_equal(df3, df1.iloc[:-1]) + + with pd.ExcelFile("test1" + read_ext) as excel: + df3 = excel.parse(0, index_col=0, skipfooter=1) + + tm.assert_frame_equal(df3, df1.iloc[:-1]) + + def test_sheet_name(self, request, engine, read_ext, df_ref): + xfail_datetimes_with_pyxlsb(engine, request) + + expected = df_ref + adjust_expected(expected, read_ext, engine) + + filename = "test1" + sheet_name = "Sheet1" + + with pd.ExcelFile(filename + read_ext) as excel: + df1_parse = excel.parse(sheet_name=sheet_name, index_col=0) # doc + + with pd.ExcelFile(filename + read_ext) as excel: + df2_parse = excel.parse(index_col=0, sheet_name=sheet_name) + + tm.assert_frame_equal(df1_parse, expected) + tm.assert_frame_equal(df2_parse, expected) + + @pytest.mark.parametrize( + "sheet_name", + [3, [0, 3], [3, 0], "Sheet4", ["Sheet1", "Sheet4"], ["Sheet4", "Sheet1"]], + ) + def test_bad_sheetname_raises(self, read_ext, sheet_name): + # GH 39250 + msg = "Worksheet index 3 is invalid|Worksheet named 'Sheet4' not found" + with pytest.raises(ValueError, match=msg): + with pd.ExcelFile("blank" + read_ext) as excel: + excel.parse(sheet_name=sheet_name) + + def test_excel_read_buffer(self, engine, read_ext): + pth = "test1" + read_ext + expected = pd.read_excel(pth, sheet_name="Sheet1", index_col=0, engine=engine) + + with open(pth, "rb") as f: + with pd.ExcelFile(f) as xls: + actual = pd.read_excel(xls, sheet_name="Sheet1", index_col=0) + + tm.assert_frame_equal(expected, actual) + + def test_reader_closes_file(self, engine, read_ext): + with open("test1" + read_ext, "rb") as f: + with pd.ExcelFile(f) as xlsx: + # parses okay + pd.read_excel(xlsx, sheet_name="Sheet1", index_col=0, engine=engine) + + assert f.closed + + def test_conflicting_excel_engines(self, read_ext): + # GH 26566 + msg = "Engine should not be specified when passing an ExcelFile" + + with pd.ExcelFile("test1" + read_ext) as xl: + with pytest.raises(ValueError, match=msg): + pd.read_excel(xl, engine="foo") + + def test_excel_read_binary(self, engine, read_ext): + # GH 15914 + expected = pd.read_excel("test1" + read_ext, engine=engine) + + with open("test1" + read_ext, "rb") as f: + data = f.read() + + actual = pd.read_excel(BytesIO(data), engine=engine) + tm.assert_frame_equal(expected, actual) + + def test_excel_read_binary_via_read_excel(self, read_ext, engine): + # GH 38424 + with open("test1" + read_ext, "rb") as f: + result = pd.read_excel(f, engine=engine) + expected = pd.read_excel("test1" + read_ext, engine=engine) + tm.assert_frame_equal(result, expected) + + def test_read_excel_header_index_out_of_range(self, engine): + # GH#43143 + with open("df_header_oob.xlsx", "rb") as f: + with pytest.raises(ValueError, match="exceeds maximum"): + pd.read_excel(f, header=[0, 1]) + + @pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"]) + def test_header_with_index_col(self, filename): + # GH 33476 + idx = Index(["Z"], name="I2") + cols = MultiIndex.from_tuples([("A", "B"), ("A", "B.1")], names=["I11", "I12"]) + expected = DataFrame([[1, 3]], index=idx, columns=cols, dtype="int64") + result = pd.read_excel( + filename, sheet_name="Sheet1", index_col=0, header=[0, 1] + ) + tm.assert_frame_equal(expected, result) + + def test_read_datetime_multiindex(self, request, engine, read_ext): + # GH 34748 + xfail_datetimes_with_pyxlsb(engine, request) + + f = "test_datetime_mi" + read_ext + with pd.ExcelFile(f) as excel: + actual = pd.read_excel(excel, header=[0, 1], index_col=0, engine=engine) + + unit = get_exp_unit(read_ext, engine) + dti = pd.DatetimeIndex(["2020-02-29", "2020-03-01"], dtype=f"M8[{unit}]") + expected_column_index = MultiIndex.from_arrays( + [dti[:1], dti[1:]], + names=[ + dti[0].to_pydatetime(), + dti[1].to_pydatetime(), + ], + ) + expected = DataFrame([], index=[], columns=expected_column_index) + + tm.assert_frame_equal(expected, actual) + + def test_engine_invalid_option(self, read_ext): + # read_ext includes the '.' hence the weird formatting + with pytest.raises(ValueError, match="Value must be one of *"): + with pd.option_context(f"io.excel{read_ext}.reader", "abc"): + pass + + def test_ignore_chartsheets(self, request, engine, read_ext): + # GH 41448 + if read_ext == ".ods": + pytest.skip("chartsheets do not exist in the ODF format") + if engine == "pyxlsb": + request.applymarker( + pytest.mark.xfail( + reason="pyxlsb can't distinguish chartsheets from worksheets" + ) + ) + with pd.ExcelFile("chartsheet" + read_ext) as excel: + assert excel.sheet_names == ["Sheet1"] + + def test_corrupt_files_closed(self, engine, read_ext): + # GH41778 + errors = (BadZipFile,) + if engine is None: + pytest.skip(f"Invalid test for engine={engine}") + elif engine == "xlrd": + import xlrd + + errors = (BadZipFile, xlrd.biffh.XLRDError) + elif engine == "calamine": + from python_calamine import CalamineError + + errors = (CalamineError,) + + with tm.ensure_clean(f"corrupt{read_ext}") as file: + Path(file).write_text("corrupt", encoding="utf-8") + with tm.assert_produces_warning(False): + try: + pd.ExcelFile(file, engine=engine) + except errors: + pass diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_style.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_style.py new file mode 100644 index 0000000000000000000000000000000000000000..89615172688d7b56fbb070dbcd4750365d7d612d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_style.py @@ -0,0 +1,298 @@ +import contextlib +import time + +import numpy as np +import pytest + +from pandas.compat import is_platform_windows +import pandas.util._test_decorators as td + +from pandas import ( + DataFrame, + read_excel, +) +import pandas._testing as tm + +from pandas.io.excel import ExcelWriter +from pandas.io.formats.excel import ExcelFormatter + +pytest.importorskip("jinja2") +# jinja2 is currently required for Styler.__init__(). Technically Styler.to_excel +# could compute styles and render to excel without jinja2, since there is no +# 'template' file, but this needs the import error to delayed until render time. + +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + + +def assert_equal_cell_styles(cell1, cell2): + # TODO: should find a better way to check equality + assert cell1.alignment.__dict__ == cell2.alignment.__dict__ + assert cell1.border.__dict__ == cell2.border.__dict__ + assert cell1.fill.__dict__ == cell2.fill.__dict__ + assert cell1.font.__dict__ == cell2.font.__dict__ + assert cell1.number_format == cell2.number_format + assert cell1.protection.__dict__ == cell2.protection.__dict__ + + +@pytest.mark.parametrize( + "engine", + ["xlsxwriter", "openpyxl"], +) +def test_styler_to_excel_unstyled(engine): + # compare DataFrame.to_excel and Styler.to_excel when no styles applied + pytest.importorskip(engine) + df = DataFrame(np.random.default_rng(2).standard_normal((2, 2))) + with tm.ensure_clean(".xlsx") as path: + with ExcelWriter(path, engine=engine) as writer: + df.to_excel(writer, sheet_name="dataframe") + df.style.to_excel(writer, sheet_name="unstyled") + + openpyxl = pytest.importorskip("openpyxl") # test loading only with openpyxl + with contextlib.closing(openpyxl.load_workbook(path)) as wb: + for col1, col2 in zip(wb["dataframe"].columns, wb["unstyled"].columns): + assert len(col1) == len(col2) + for cell1, cell2 in zip(col1, col2): + assert cell1.value == cell2.value + assert_equal_cell_styles(cell1, cell2) + + +shared_style_params = [ + ( + "background-color: #111222", + ["fill", "fgColor", "rgb"], + {"xlsxwriter": "FF111222", "openpyxl": "00111222"}, + ), + ( + "color: #111222", + ["font", "color", "value"], + {"xlsxwriter": "FF111222", "openpyxl": "00111222"}, + ), + ("font-family: Arial;", ["font", "name"], "arial"), + ("font-weight: bold;", ["font", "b"], True), + ("font-style: italic;", ["font", "i"], True), + ("text-decoration: underline;", ["font", "u"], "single"), + ("number-format: $??,???.00;", ["number_format"], "$??,???.00"), + ("text-align: left;", ["alignment", "horizontal"], "left"), + ( + "vertical-align: bottom;", + ["alignment", "vertical"], + {"xlsxwriter": None, "openpyxl": "bottom"}, # xlsxwriter Fails + ), + ("vertical-align: middle;", ["alignment", "vertical"], "center"), + # Border widths + ("border-left: 2pt solid red", ["border", "left", "style"], "medium"), + ("border-left: 1pt dotted red", ["border", "left", "style"], "dotted"), + ("border-left: 2pt dotted red", ["border", "left", "style"], "mediumDashDotDot"), + ("border-left: 1pt dashed red", ["border", "left", "style"], "dashed"), + ("border-left: 2pt dashed red", ["border", "left", "style"], "mediumDashed"), + ("border-left: 1pt solid red", ["border", "left", "style"], "thin"), + ("border-left: 3pt solid red", ["border", "left", "style"], "thick"), + # Border expansion + ( + "border-left: 2pt solid #111222", + ["border", "left", "color", "rgb"], + {"xlsxwriter": "FF111222", "openpyxl": "00111222"}, + ), + ("border: 1pt solid red", ["border", "top", "style"], "thin"), + ( + "border: 1pt solid #111222", + ["border", "top", "color", "rgb"], + {"xlsxwriter": "FF111222", "openpyxl": "00111222"}, + ), + ("border: 1pt solid red", ["border", "right", "style"], "thin"), + ( + "border: 1pt solid #111222", + ["border", "right", "color", "rgb"], + {"xlsxwriter": "FF111222", "openpyxl": "00111222"}, + ), + ("border: 1pt solid red", ["border", "bottom", "style"], "thin"), + ( + "border: 1pt solid #111222", + ["border", "bottom", "color", "rgb"], + {"xlsxwriter": "FF111222", "openpyxl": "00111222"}, + ), + ("border: 1pt solid red", ["border", "left", "style"], "thin"), + ( + "border: 1pt solid #111222", + ["border", "left", "color", "rgb"], + {"xlsxwriter": "FF111222", "openpyxl": "00111222"}, + ), + # Border styles + ( + "border-left-style: hair; border-left-color: black", + ["border", "left", "style"], + "hair", + ), +] + + +@pytest.mark.parametrize( + "engine", + ["xlsxwriter", "openpyxl"], +) +@pytest.mark.parametrize("css, attrs, expected", shared_style_params) +def test_styler_to_excel_basic(engine, css, attrs, expected): + pytest.importorskip(engine) + df = DataFrame(np.random.default_rng(2).standard_normal((1, 1))) + styler = df.style.map(lambda x: css) + + with tm.ensure_clean(".xlsx") as path: + with ExcelWriter(path, engine=engine) as writer: + df.to_excel(writer, sheet_name="dataframe") + styler.to_excel(writer, sheet_name="styled") + + openpyxl = pytest.importorskip("openpyxl") # test loading only with openpyxl + with contextlib.closing(openpyxl.load_workbook(path)) as wb: + # test unstyled data cell does not have expected styles + # test styled cell has expected styles + u_cell, s_cell = wb["dataframe"].cell(2, 2), wb["styled"].cell(2, 2) + for attr in attrs: + u_cell, s_cell = getattr(u_cell, attr, None), getattr(s_cell, attr) + + if isinstance(expected, dict): + assert u_cell is None or u_cell != expected[engine] + assert s_cell == expected[engine] + else: + assert u_cell is None or u_cell != expected + assert s_cell == expected + + +@pytest.mark.parametrize( + "engine", + ["xlsxwriter", "openpyxl"], +) +@pytest.mark.parametrize("css, attrs, expected", shared_style_params) +def test_styler_to_excel_basic_indexes(engine, css, attrs, expected): + pytest.importorskip(engine) + df = DataFrame(np.random.default_rng(2).standard_normal((1, 1))) + + styler = df.style + styler.map_index(lambda x: css, axis=0) + styler.map_index(lambda x: css, axis=1) + + null_styler = df.style + null_styler.map(lambda x: "null: css;") + null_styler.map_index(lambda x: "null: css;", axis=0) + null_styler.map_index(lambda x: "null: css;", axis=1) + + with tm.ensure_clean(".xlsx") as path: + with ExcelWriter(path, engine=engine) as writer: + null_styler.to_excel(writer, sheet_name="null_styled") + styler.to_excel(writer, sheet_name="styled") + + openpyxl = pytest.importorskip("openpyxl") # test loading only with openpyxl + with contextlib.closing(openpyxl.load_workbook(path)) as wb: + # test null styled index cells does not have expected styles + # test styled cell has expected styles + ui_cell, si_cell = wb["null_styled"].cell(2, 1), wb["styled"].cell(2, 1) + uc_cell, sc_cell = wb["null_styled"].cell(1, 2), wb["styled"].cell(1, 2) + for attr in attrs: + ui_cell, si_cell = getattr(ui_cell, attr, None), getattr(si_cell, attr) + uc_cell, sc_cell = getattr(uc_cell, attr, None), getattr(sc_cell, attr) + + if isinstance(expected, dict): + assert ui_cell is None or ui_cell != expected[engine] + assert si_cell == expected[engine] + assert uc_cell is None or uc_cell != expected[engine] + assert sc_cell == expected[engine] + else: + assert ui_cell is None or ui_cell != expected + assert si_cell == expected + assert uc_cell is None or uc_cell != expected + assert sc_cell == expected + + +# From https://openpyxl.readthedocs.io/en/stable/api/openpyxl.styles.borders.html +# Note: Leaving behavior of "width"-type styles undefined; user should use border-width +# instead +excel_border_styles = [ + # "thin", + "dashed", + "mediumDashDot", + "dashDotDot", + "hair", + "dotted", + "mediumDashDotDot", + # "medium", + "double", + "dashDot", + "slantDashDot", + # "thick", + "mediumDashed", +] + + +@pytest.mark.parametrize( + "engine", + ["xlsxwriter", "openpyxl"], +) +@pytest.mark.parametrize("border_style", excel_border_styles) +def test_styler_to_excel_border_style(engine, border_style): + css = f"border-left: {border_style} black thin" + attrs = ["border", "left", "style"] + expected = border_style + + pytest.importorskip(engine) + df = DataFrame(np.random.default_rng(2).standard_normal((1, 1))) + styler = df.style.map(lambda x: css) + + with tm.ensure_clean(".xlsx") as path: + with ExcelWriter(path, engine=engine) as writer: + df.to_excel(writer, sheet_name="dataframe") + styler.to_excel(writer, sheet_name="styled") + + openpyxl = pytest.importorskip("openpyxl") # test loading only with openpyxl + with contextlib.closing(openpyxl.load_workbook(path)) as wb: + # test unstyled data cell does not have expected styles + # test styled cell has expected styles + u_cell, s_cell = wb["dataframe"].cell(2, 2), wb["styled"].cell(2, 2) + for attr in attrs: + u_cell, s_cell = getattr(u_cell, attr, None), getattr(s_cell, attr) + + if isinstance(expected, dict): + assert u_cell is None or u_cell != expected[engine] + assert s_cell == expected[engine] + else: + assert u_cell is None or u_cell != expected + assert s_cell == expected + + +def test_styler_custom_converter(): + openpyxl = pytest.importorskip("openpyxl") + + def custom_converter(css): + return {"font": {"color": {"rgb": "111222"}}} + + df = DataFrame(np.random.default_rng(2).standard_normal((1, 1))) + styler = df.style.map(lambda x: "color: #888999") + with tm.ensure_clean(".xlsx") as path: + with ExcelWriter(path, engine="openpyxl") as writer: + ExcelFormatter(styler, style_converter=custom_converter).write( + writer, sheet_name="custom" + ) + + with contextlib.closing(openpyxl.load_workbook(path)) as wb: + assert wb["custom"].cell(2, 2).font.color.value == "00111222" + + +@pytest.mark.single_cpu +@td.skip_if_not_us_locale +def test_styler_to_s3(s3_public_bucket, s3so): + # GH#46381 + + mock_bucket_name, target_file = s3_public_bucket.name, "test.xlsx" + df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]}) + styler = df.style.set_sticky(axis="index") + styler.to_excel(f"s3://{mock_bucket_name}/{target_file}", storage_options=s3so) + timeout = 5 + while True: + if target_file in (obj.key for obj in s3_public_bucket.objects.all()): + break + time.sleep(0.1) + timeout -= 0.1 + assert timeout > 0, "Timed out waiting for file to appear on moto" + result = read_excel( + f"s3://{mock_bucket_name}/{target_file}", index_col=0, storage_options=s3so + ) + tm.assert_frame_equal(result, df) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_writers.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_writers.py new file mode 100644 index 0000000000000000000000000000000000000000..292eab2d881526e6816d85f1fcd38aaa35255243 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_writers.py @@ -0,0 +1,1511 @@ +from datetime import ( + date, + datetime, + timedelta, +) +from functools import partial +from io import BytesIO +import os +import re + +import numpy as np +import pytest + +from pandas.compat import is_platform_windows +from pandas.compat._constants import PY310 +from pandas.compat._optional import import_optional_dependency +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + date_range, + option_context, +) +import pandas._testing as tm + +from pandas.io.excel import ( + ExcelFile, + ExcelWriter, + _OpenpyxlWriter, + _XlsxWriter, + register_writer, +) +from pandas.io.excel._util import _writers + +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + + +def get_exp_unit(path: str) -> str: + return "ns" + + +@pytest.fixture +def frame(float_frame): + """ + Returns the first ten items in fixture "float_frame". + """ + return float_frame[:10] + + +@pytest.fixture(params=[True, False]) +def merge_cells(request): + return request.param + + +@pytest.fixture +def path(ext): + """ + Fixture to open file for use in each test case. + """ + with tm.ensure_clean(ext) as file_path: + yield file_path + + +@pytest.fixture +def set_engine(engine, ext): + """ + Fixture to set engine for use in each test case. + + Rather than requiring `engine=...` to be provided explicitly as an + argument in each test, this fixture sets a global option to dictate + which engine should be used to write Excel files. After executing + the test it rolls back said change to the global option. + """ + option_name = f"io.excel.{ext.strip('.')}.writer" + with option_context(option_name, engine): + yield + + +@pytest.mark.parametrize( + "ext", + [ + pytest.param(".xlsx", marks=[td.skip_if_no("openpyxl"), td.skip_if_no("xlrd")]), + pytest.param(".xlsm", marks=[td.skip_if_no("openpyxl"), td.skip_if_no("xlrd")]), + pytest.param( + ".xlsx", marks=[td.skip_if_no("xlsxwriter"), td.skip_if_no("xlrd")] + ), + pytest.param(".ods", marks=td.skip_if_no("odf")), + ], +) +class TestRoundTrip: + @pytest.mark.parametrize( + "header,expected", + [(None, DataFrame([np.nan] * 4)), (0, DataFrame({"Unnamed: 0": [np.nan] * 3}))], + ) + def test_read_one_empty_col_no_header(self, ext, header, expected): + # xref gh-12292 + filename = "no_header" + df = DataFrame([["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]]) + + with tm.ensure_clean(ext) as path: + df.to_excel(path, sheet_name=filename, index=False, header=False) + result = pd.read_excel( + path, sheet_name=filename, usecols=[0], header=header + ) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "header,expected", + [(None, DataFrame([0] + [np.nan] * 4)), (0, DataFrame([np.nan] * 4))], + ) + def test_read_one_empty_col_with_header(self, ext, header, expected): + filename = "with_header" + df = DataFrame([["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]]) + + with tm.ensure_clean(ext) as path: + df.to_excel(path, sheet_name="with_header", index=False, header=True) + result = pd.read_excel( + path, sheet_name=filename, usecols=[0], header=header + ) + + tm.assert_frame_equal(result, expected) + + def test_set_column_names_in_parameter(self, ext): + # GH 12870 : pass down column names associated with + # keyword argument names + refdf = DataFrame([[1, "foo"], [2, "bar"], [3, "baz"]], columns=["a", "b"]) + + with tm.ensure_clean(ext) as pth: + with ExcelWriter(pth) as writer: + refdf.to_excel( + writer, sheet_name="Data_no_head", header=False, index=False + ) + refdf.to_excel(writer, sheet_name="Data_with_head", index=False) + + refdf.columns = ["A", "B"] + + with ExcelFile(pth) as reader: + xlsdf_no_head = pd.read_excel( + reader, sheet_name="Data_no_head", header=None, names=["A", "B"] + ) + xlsdf_with_head = pd.read_excel( + reader, + sheet_name="Data_with_head", + index_col=None, + names=["A", "B"], + ) + + tm.assert_frame_equal(xlsdf_no_head, refdf) + tm.assert_frame_equal(xlsdf_with_head, refdf) + + def test_creating_and_reading_multiple_sheets(self, ext): + # see gh-9450 + # + # Test reading multiple sheets, from a runtime + # created Excel file with multiple sheets. + def tdf(col_sheet_name): + d, i = [11, 22, 33], [1, 2, 3] + return DataFrame(d, i, columns=[col_sheet_name]) + + sheets = ["AAA", "BBB", "CCC"] + + dfs = [tdf(s) for s in sheets] + dfs = dict(zip(sheets, dfs)) + + with tm.ensure_clean(ext) as pth: + with ExcelWriter(pth) as ew: + for sheetname, df in dfs.items(): + df.to_excel(ew, sheet_name=sheetname) + + dfs_returned = pd.read_excel(pth, sheet_name=sheets, index_col=0) + + for s in sheets: + tm.assert_frame_equal(dfs[s], dfs_returned[s]) + + def test_read_excel_multiindex_empty_level(self, ext): + # see gh-12453 + with tm.ensure_clean(ext) as path: + df = DataFrame( + { + ("One", "x"): {0: 1}, + ("Two", "X"): {0: 3}, + ("Two", "Y"): {0: 7}, + ("Zero", ""): {0: 0}, + } + ) + + expected = DataFrame( + { + ("One", "x"): {0: 1}, + ("Two", "X"): {0: 3}, + ("Two", "Y"): {0: 7}, + ("Zero", "Unnamed: 4_level_1"): {0: 0}, + } + ) + + df.to_excel(path) + actual = pd.read_excel(path, header=[0, 1], index_col=0) + tm.assert_frame_equal(actual, expected) + + df = DataFrame( + { + ("Beg", ""): {0: 0}, + ("Middle", "x"): {0: 1}, + ("Tail", "X"): {0: 3}, + ("Tail", "Y"): {0: 7}, + } + ) + + expected = DataFrame( + { + ("Beg", "Unnamed: 1_level_1"): {0: 0}, + ("Middle", "x"): {0: 1}, + ("Tail", "X"): {0: 3}, + ("Tail", "Y"): {0: 7}, + } + ) + + df.to_excel(path) + actual = pd.read_excel(path, header=[0, 1], index_col=0) + tm.assert_frame_equal(actual, expected) + + @pytest.mark.parametrize("c_idx_names", ["a", None]) + @pytest.mark.parametrize("r_idx_names", ["b", None]) + @pytest.mark.parametrize("c_idx_levels", [1, 3]) + @pytest.mark.parametrize("r_idx_levels", [1, 3]) + def test_excel_multindex_roundtrip( + self, ext, c_idx_names, r_idx_names, c_idx_levels, r_idx_levels, request + ): + # see gh-4679 + with tm.ensure_clean(ext) as pth: + # Empty name case current read in as + # unnamed levels, not Nones. + check_names = bool(r_idx_names) or r_idx_levels <= 1 + + if c_idx_levels == 1: + columns = Index(list("abcde")) + else: + columns = MultiIndex.from_arrays( + [range(5) for _ in range(c_idx_levels)], + names=[f"{c_idx_names}-{i}" for i in range(c_idx_levels)], + ) + if r_idx_levels == 1: + index = Index(list("ghijk")) + else: + index = MultiIndex.from_arrays( + [range(5) for _ in range(r_idx_levels)], + names=[f"{r_idx_names}-{i}" for i in range(r_idx_levels)], + ) + df = DataFrame( + 1.1 * np.ones((5, 5)), + columns=columns, + index=index, + ) + df.to_excel(pth) + + act = pd.read_excel( + pth, + index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels)), + ) + tm.assert_frame_equal(df, act, check_names=check_names) + + df.iloc[0, :] = np.nan + df.to_excel(pth) + + act = pd.read_excel( + pth, + index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels)), + ) + tm.assert_frame_equal(df, act, check_names=check_names) + + df.iloc[-1, :] = np.nan + df.to_excel(pth) + act = pd.read_excel( + pth, + index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels)), + ) + tm.assert_frame_equal(df, act, check_names=check_names) + + def test_read_excel_parse_dates(self, ext): + # see gh-11544, gh-12051 + df = DataFrame( + {"col": [1, 2, 3], "date_strings": date_range("2012-01-01", periods=3)} + ) + df2 = df.copy() + df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y") + + with tm.ensure_clean(ext) as pth: + df2.to_excel(pth) + + res = pd.read_excel(pth, index_col=0) + tm.assert_frame_equal(df2, res) + + res = pd.read_excel(pth, parse_dates=["date_strings"], index_col=0) + tm.assert_frame_equal(df, res) + + date_parser = lambda x: datetime.strptime(x, "%m/%d/%Y") + with tm.assert_produces_warning( + FutureWarning, + match="use 'date_format' instead", + raise_on_extra_warnings=False, + ): + res = pd.read_excel( + pth, + parse_dates=["date_strings"], + date_parser=date_parser, + index_col=0, + ) + tm.assert_frame_equal(df, res) + res = pd.read_excel( + pth, parse_dates=["date_strings"], date_format="%m/%d/%Y", index_col=0 + ) + tm.assert_frame_equal(df, res) + + def test_multiindex_interval_datetimes(self, ext): + # GH 30986 + midx = MultiIndex.from_arrays( + [ + range(4), + pd.interval_range( + start=pd.Timestamp("2020-01-01"), periods=4, freq="6ME" + ), + ] + ) + df = DataFrame(range(4), index=midx) + with tm.ensure_clean(ext) as pth: + df.to_excel(pth) + result = pd.read_excel(pth, index_col=[0, 1]) + expected = DataFrame( + range(4), + MultiIndex.from_arrays( + [ + range(4), + [ + "(2020-01-31 00:00:00, 2020-07-31 00:00:00]", + "(2020-07-31 00:00:00, 2021-01-31 00:00:00]", + "(2021-01-31 00:00:00, 2021-07-31 00:00:00]", + "(2021-07-31 00:00:00, 2022-01-31 00:00:00]", + ], + ] + ), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "engine,ext", + [ + pytest.param( + "openpyxl", + ".xlsx", + marks=[td.skip_if_no("openpyxl"), td.skip_if_no("xlrd")], + ), + pytest.param( + "openpyxl", + ".xlsm", + marks=[td.skip_if_no("openpyxl"), td.skip_if_no("xlrd")], + ), + pytest.param( + "xlsxwriter", + ".xlsx", + marks=[td.skip_if_no("xlsxwriter"), td.skip_if_no("xlrd")], + ), + pytest.param("odf", ".ods", marks=td.skip_if_no("odf")), + ], +) +@pytest.mark.usefixtures("set_engine") +class TestExcelWriter: + def test_excel_sheet_size(self, path): + # GH 26080 + breaking_row_count = 2**20 + 1 + breaking_col_count = 2**14 + 1 + # purposely using two arrays to prevent memory issues while testing + row_arr = np.zeros(shape=(breaking_row_count, 1)) + col_arr = np.zeros(shape=(1, breaking_col_count)) + row_df = DataFrame(row_arr) + col_df = DataFrame(col_arr) + + msg = "sheet is too large" + with pytest.raises(ValueError, match=msg): + row_df.to_excel(path) + + with pytest.raises(ValueError, match=msg): + col_df.to_excel(path) + + def test_excel_sheet_by_name_raise(self, path): + gt = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) + gt.to_excel(path) + + with ExcelFile(path) as xl: + df = pd.read_excel(xl, sheet_name=0, index_col=0) + + tm.assert_frame_equal(gt, df) + + msg = "Worksheet named '0' not found" + with pytest.raises(ValueError, match=msg): + pd.read_excel(xl, "0") + + def test_excel_writer_context_manager(self, frame, path): + with ExcelWriter(path) as writer: + frame.to_excel(writer, sheet_name="Data1") + frame2 = frame.copy() + frame2.columns = frame.columns[::-1] + frame2.to_excel(writer, sheet_name="Data2") + + with ExcelFile(path) as reader: + found_df = pd.read_excel(reader, sheet_name="Data1", index_col=0) + found_df2 = pd.read_excel(reader, sheet_name="Data2", index_col=0) + + tm.assert_frame_equal(found_df, frame) + tm.assert_frame_equal(found_df2, frame2) + + def test_roundtrip(self, frame, path): + frame = frame.copy() + frame.iloc[:5, frame.columns.get_loc("A")] = np.nan + + frame.to_excel(path, sheet_name="test1") + frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) + frame.to_excel(path, sheet_name="test1", header=False) + frame.to_excel(path, sheet_name="test1", index=False) + + # test roundtrip + frame.to_excel(path, sheet_name="test1") + recons = pd.read_excel(path, sheet_name="test1", index_col=0) + tm.assert_frame_equal(frame, recons) + + frame.to_excel(path, sheet_name="test1", index=False) + recons = pd.read_excel(path, sheet_name="test1", index_col=None) + recons.index = frame.index + tm.assert_frame_equal(frame, recons) + + frame.to_excel(path, sheet_name="test1", na_rep="NA") + recons = pd.read_excel(path, sheet_name="test1", index_col=0, na_values=["NA"]) + tm.assert_frame_equal(frame, recons) + + # GH 3611 + frame.to_excel(path, sheet_name="test1", na_rep="88") + recons = pd.read_excel(path, sheet_name="test1", index_col=0, na_values=["88"]) + tm.assert_frame_equal(frame, recons) + + frame.to_excel(path, sheet_name="test1", na_rep="88") + recons = pd.read_excel( + path, sheet_name="test1", index_col=0, na_values=[88, 88.0] + ) + tm.assert_frame_equal(frame, recons) + + # GH 6573 + frame.to_excel(path, sheet_name="Sheet1") + recons = pd.read_excel(path, index_col=0) + tm.assert_frame_equal(frame, recons) + + frame.to_excel(path, sheet_name="0") + recons = pd.read_excel(path, index_col=0) + tm.assert_frame_equal(frame, recons) + + # GH 8825 Pandas Series should provide to_excel method + s = frame["A"] + s.to_excel(path) + recons = pd.read_excel(path, index_col=0) + tm.assert_frame_equal(s.to_frame(), recons) + + def test_mixed(self, frame, path): + mixed_frame = frame.copy() + mixed_frame["foo"] = "bar" + + mixed_frame.to_excel(path, sheet_name="test1") + with ExcelFile(path) as reader: + recons = pd.read_excel(reader, sheet_name="test1", index_col=0) + tm.assert_frame_equal(mixed_frame, recons) + + def test_ts_frame(self, path): + unit = get_exp_unit(path) + df = DataFrame( + np.random.default_rng(2).standard_normal((5, 4)), + columns=Index(list("ABCD")), + index=date_range("2000-01-01", periods=5, freq="B"), + ) + + # freq doesn't round-trip + index = pd.DatetimeIndex(np.asarray(df.index), freq=None) + df.index = index + + expected = df[:] + expected.index = expected.index.as_unit(unit) + + df.to_excel(path, sheet_name="test1") + with ExcelFile(path) as reader: + recons = pd.read_excel(reader, sheet_name="test1", index_col=0) + tm.assert_frame_equal(expected, recons) + + def test_basics_with_nan(self, frame, path): + frame = frame.copy() + frame.iloc[:5, frame.columns.get_loc("A")] = np.nan + frame.to_excel(path, sheet_name="test1") + frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) + frame.to_excel(path, sheet_name="test1", header=False) + frame.to_excel(path, sheet_name="test1", index=False) + + @pytest.mark.parametrize("np_type", [np.int8, np.int16, np.int32, np.int64]) + def test_int_types(self, np_type, path): + # Test np.int values read come back as int + # (rather than float which is Excel's format). + df = DataFrame( + np.random.default_rng(2).integers(-10, 10, size=(10, 2)), dtype=np_type + ) + df.to_excel(path, sheet_name="test1") + + with ExcelFile(path) as reader: + recons = pd.read_excel(reader, sheet_name="test1", index_col=0) + + int_frame = df.astype(np.int64) + tm.assert_frame_equal(int_frame, recons) + + recons2 = pd.read_excel(path, sheet_name="test1", index_col=0) + tm.assert_frame_equal(int_frame, recons2) + + @pytest.mark.parametrize("np_type", [np.float16, np.float32, np.float64]) + def test_float_types(self, np_type, path): + # Test np.float values read come back as float. + df = DataFrame(np.random.default_rng(2).random(10), dtype=np_type) + df.to_excel(path, sheet_name="test1") + + with ExcelFile(path) as reader: + recons = pd.read_excel(reader, sheet_name="test1", index_col=0).astype( + np_type + ) + + tm.assert_frame_equal(df, recons) + + def test_bool_types(self, path): + # Test np.bool_ values read come back as float. + df = DataFrame([1, 0, True, False], dtype=np.bool_) + df.to_excel(path, sheet_name="test1") + + with ExcelFile(path) as reader: + recons = pd.read_excel(reader, sheet_name="test1", index_col=0).astype( + np.bool_ + ) + + tm.assert_frame_equal(df, recons) + + def test_inf_roundtrip(self, path): + df = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)]) + df.to_excel(path, sheet_name="test1") + + with ExcelFile(path) as reader: + recons = pd.read_excel(reader, sheet_name="test1", index_col=0) + + tm.assert_frame_equal(df, recons) + + def test_sheets(self, frame, path): + # freq doesn't round-trip + unit = get_exp_unit(path) + tsframe = DataFrame( + np.random.default_rng(2).standard_normal((5, 4)), + columns=Index(list("ABCD")), + index=date_range("2000-01-01", periods=5, freq="B"), + ) + index = pd.DatetimeIndex(np.asarray(tsframe.index), freq=None) + tsframe.index = index + + expected = tsframe[:] + expected.index = expected.index.as_unit(unit) + + frame = frame.copy() + frame.iloc[:5, frame.columns.get_loc("A")] = np.nan + + frame.to_excel(path, sheet_name="test1") + frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) + frame.to_excel(path, sheet_name="test1", header=False) + frame.to_excel(path, sheet_name="test1", index=False) + + # Test writing to separate sheets + with ExcelWriter(path) as writer: + frame.to_excel(writer, sheet_name="test1") + tsframe.to_excel(writer, sheet_name="test2") + with ExcelFile(path) as reader: + recons = pd.read_excel(reader, sheet_name="test1", index_col=0) + tm.assert_frame_equal(frame, recons) + recons = pd.read_excel(reader, sheet_name="test2", index_col=0) + tm.assert_frame_equal(expected, recons) + assert 2 == len(reader.sheet_names) + assert "test1" == reader.sheet_names[0] + assert "test2" == reader.sheet_names[1] + + def test_colaliases(self, frame, path): + frame = frame.copy() + frame.iloc[:5, frame.columns.get_loc("A")] = np.nan + + frame.to_excel(path, sheet_name="test1") + frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) + frame.to_excel(path, sheet_name="test1", header=False) + frame.to_excel(path, sheet_name="test1", index=False) + + # column aliases + col_aliases = Index(["AA", "X", "Y", "Z"]) + frame.to_excel(path, sheet_name="test1", header=col_aliases) + with ExcelFile(path) as reader: + rs = pd.read_excel(reader, sheet_name="test1", index_col=0) + xp = frame.copy() + xp.columns = col_aliases + tm.assert_frame_equal(xp, rs) + + def test_roundtrip_indexlabels(self, merge_cells, frame, path): + frame = frame.copy() + frame.iloc[:5, frame.columns.get_loc("A")] = np.nan + + frame.to_excel(path, sheet_name="test1") + frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) + frame.to_excel(path, sheet_name="test1", header=False) + frame.to_excel(path, sheet_name="test1", index=False) + + # test index_label + df = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) >= 0 + df.to_excel( + path, sheet_name="test1", index_label=["test"], merge_cells=merge_cells + ) + with ExcelFile(path) as reader: + recons = pd.read_excel(reader, sheet_name="test1", index_col=0).astype( + np.int64 + ) + df.index.names = ["test"] + assert df.index.names == recons.index.names + + df = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) >= 0 + df.to_excel( + path, + sheet_name="test1", + index_label=["test", "dummy", "dummy2"], + merge_cells=merge_cells, + ) + with ExcelFile(path) as reader: + recons = pd.read_excel(reader, sheet_name="test1", index_col=0).astype( + np.int64 + ) + df.index.names = ["test"] + assert df.index.names == recons.index.names + + df = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) >= 0 + df.to_excel( + path, sheet_name="test1", index_label="test", merge_cells=merge_cells + ) + with ExcelFile(path) as reader: + recons = pd.read_excel(reader, sheet_name="test1", index_col=0).astype( + np.int64 + ) + df.index.names = ["test"] + tm.assert_frame_equal(df, recons.astype(bool)) + + frame.to_excel( + path, + sheet_name="test1", + columns=["A", "B", "C", "D"], + index=False, + merge_cells=merge_cells, + ) + # take 'A' and 'B' as indexes (same row as cols 'C', 'D') + df = frame.copy() + df = df.set_index(["A", "B"]) + + with ExcelFile(path) as reader: + recons = pd.read_excel(reader, sheet_name="test1", index_col=[0, 1]) + tm.assert_frame_equal(df, recons) + + def test_excel_roundtrip_indexname(self, merge_cells, path): + df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) + df.index.name = "foo" + + df.to_excel(path, merge_cells=merge_cells) + + with ExcelFile(path) as xf: + result = pd.read_excel(xf, sheet_name=xf.sheet_names[0], index_col=0) + + tm.assert_frame_equal(result, df) + assert result.index.name == "foo" + + def test_excel_roundtrip_datetime(self, merge_cells, path): + # datetime.date, not sure what to test here exactly + unit = get_exp_unit(path) + + # freq does not round-trip + tsframe = DataFrame( + np.random.default_rng(2).standard_normal((5, 4)), + columns=Index(list("ABCD")), + index=date_range("2000-01-01", periods=5, freq="B"), + ) + index = pd.DatetimeIndex(np.asarray(tsframe.index), freq=None) + tsframe.index = index + + tsf = tsframe.copy() + + tsf.index = [x.date() for x in tsframe.index] + tsf.to_excel(path, sheet_name="test1", merge_cells=merge_cells) + + with ExcelFile(path) as reader: + recons = pd.read_excel(reader, sheet_name="test1", index_col=0) + + expected = tsframe[:] + expected.index = expected.index.as_unit(unit) + tm.assert_frame_equal(expected, recons) + + def test_excel_date_datetime_format(self, ext, path): + # see gh-4133 + # + # Excel output format strings + unit = get_exp_unit(path) + + df = DataFrame( + [ + [date(2014, 1, 31), date(1999, 9, 24)], + [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)], + ], + index=["DATE", "DATETIME"], + columns=["X", "Y"], + ) + df_expected = DataFrame( + [ + [datetime(2014, 1, 31), datetime(1999, 9, 24)], + [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)], + ], + index=["DATE", "DATETIME"], + columns=["X", "Y"], + ) + df_expected = df_expected.astype(f"M8[{unit}]") + + with tm.ensure_clean(ext) as filename2: + with ExcelWriter(path) as writer1: + df.to_excel(writer1, sheet_name="test1") + + with ExcelWriter( + filename2, + date_format="DD.MM.YYYY", + datetime_format="DD.MM.YYYY HH-MM-SS", + ) as writer2: + df.to_excel(writer2, sheet_name="test1") + + with ExcelFile(path) as reader1: + rs1 = pd.read_excel(reader1, sheet_name="test1", index_col=0) + + with ExcelFile(filename2) as reader2: + rs2 = pd.read_excel(reader2, sheet_name="test1", index_col=0) + + tm.assert_frame_equal(rs1, rs2) + + # Since the reader returns a datetime object for dates, + # we need to use df_expected to check the result. + tm.assert_frame_equal(rs2, df_expected) + + def test_to_excel_interval_no_labels(self, path, using_infer_string): + # see gh-19242 + # + # Test writing Interval without labels. + df = DataFrame( + np.random.default_rng(2).integers(-10, 10, size=(20, 1)), dtype=np.int64 + ) + expected = df.copy() + + df["new"] = pd.cut(df[0], 10) + expected["new"] = pd.cut(expected[0], 10).astype( + str if not using_infer_string else "string[pyarrow_numpy]" + ) + + df.to_excel(path, sheet_name="test1") + with ExcelFile(path) as reader: + recons = pd.read_excel(reader, sheet_name="test1", index_col=0) + tm.assert_frame_equal(expected, recons) + + def test_to_excel_interval_labels(self, path): + # see gh-19242 + # + # Test writing Interval with labels. + df = DataFrame( + np.random.default_rng(2).integers(-10, 10, size=(20, 1)), dtype=np.int64 + ) + expected = df.copy() + intervals = pd.cut( + df[0], 10, labels=["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"] + ) + df["new"] = intervals + expected["new"] = pd.Series(list(intervals)) + + df.to_excel(path, sheet_name="test1") + with ExcelFile(path) as reader: + recons = pd.read_excel(reader, sheet_name="test1", index_col=0) + tm.assert_frame_equal(expected, recons) + + def test_to_excel_timedelta(self, path): + # see gh-19242, gh-9155 + # + # Test writing timedelta to xls. + df = DataFrame( + np.random.default_rng(2).integers(-10, 10, size=(20, 1)), + columns=["A"], + dtype=np.int64, + ) + expected = df.copy() + + df["new"] = df["A"].apply(lambda x: timedelta(seconds=x)) + expected["new"] = expected["A"].apply( + lambda x: timedelta(seconds=x).total_seconds() / 86400 + ) + + df.to_excel(path, sheet_name="test1") + with ExcelFile(path) as reader: + recons = pd.read_excel(reader, sheet_name="test1", index_col=0) + tm.assert_frame_equal(expected, recons) + + def test_to_excel_periodindex(self, path): + # xp has a PeriodIndex + df = DataFrame( + np.random.default_rng(2).standard_normal((5, 4)), + columns=Index(list("ABCD")), + index=date_range("2000-01-01", periods=5, freq="B"), + ) + xp = df.resample("ME").mean().to_period("M") + + xp.to_excel(path, sheet_name="sht1") + + with ExcelFile(path) as reader: + rs = pd.read_excel(reader, sheet_name="sht1", index_col=0) + tm.assert_frame_equal(xp, rs.to_period("M")) + + def test_to_excel_multiindex(self, merge_cells, frame, path): + arrays = np.arange(len(frame.index) * 2, dtype=np.int64).reshape(2, -1) + new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) + frame.index = new_index + + frame.to_excel(path, sheet_name="test1", header=False) + frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) + + # round trip + frame.to_excel(path, sheet_name="test1", merge_cells=merge_cells) + with ExcelFile(path) as reader: + df = pd.read_excel(reader, sheet_name="test1", index_col=[0, 1]) + tm.assert_frame_equal(frame, df) + + # GH13511 + def test_to_excel_multiindex_nan_label(self, merge_cells, path): + df = DataFrame( + { + "A": [None, 2, 3], + "B": [10, 20, 30], + "C": np.random.default_rng(2).random(3), + } + ) + df = df.set_index(["A", "B"]) + + df.to_excel(path, merge_cells=merge_cells) + df1 = pd.read_excel(path, index_col=[0, 1]) + tm.assert_frame_equal(df, df1) + + # Test for Issue 11328. If column indices are integers, make + # sure they are handled correctly for either setting of + # merge_cells + def test_to_excel_multiindex_cols(self, merge_cells, frame, path): + arrays = np.arange(len(frame.index) * 2, dtype=np.int64).reshape(2, -1) + new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) + frame.index = new_index + + new_cols_index = MultiIndex.from_tuples([(40, 1), (40, 2), (50, 1), (50, 2)]) + frame.columns = new_cols_index + header = [0, 1] + if not merge_cells: + header = 0 + + # round trip + frame.to_excel(path, sheet_name="test1", merge_cells=merge_cells) + with ExcelFile(path) as reader: + df = pd.read_excel( + reader, sheet_name="test1", header=header, index_col=[0, 1] + ) + if not merge_cells: + fm = frame.columns._format_multi(sparsify=False, include_names=False) + frame.columns = [".".join(map(str, q)) for q in zip(*fm)] + tm.assert_frame_equal(frame, df) + + def test_to_excel_multiindex_dates(self, merge_cells, path): + # try multiindex with dates + unit = get_exp_unit(path) + tsframe = DataFrame( + np.random.default_rng(2).standard_normal((5, 4)), + columns=Index(list("ABCD")), + index=date_range("2000-01-01", periods=5, freq="B"), + ) + tsframe.index = MultiIndex.from_arrays( + [ + tsframe.index.as_unit(unit), + np.arange(len(tsframe.index), dtype=np.int64), + ], + names=["time", "foo"], + ) + + tsframe.to_excel(path, sheet_name="test1", merge_cells=merge_cells) + with ExcelFile(path) as reader: + recons = pd.read_excel(reader, sheet_name="test1", index_col=[0, 1]) + + tm.assert_frame_equal(tsframe, recons) + assert recons.index.names == ("time", "foo") + + def test_to_excel_multiindex_no_write_index(self, path): + # Test writing and re-reading a MI without the index. GH 5616. + + # Initial non-MI frame. + frame1 = DataFrame({"a": [10, 20], "b": [30, 40], "c": [50, 60]}) + + # Add a MI. + frame2 = frame1.copy() + multi_index = MultiIndex.from_tuples([(70, 80), (90, 100)]) + frame2.index = multi_index + + # Write out to Excel without the index. + frame2.to_excel(path, sheet_name="test1", index=False) + + # Read it back in. + with ExcelFile(path) as reader: + frame3 = pd.read_excel(reader, sheet_name="test1") + + # Test that it is the same as the initial frame. + tm.assert_frame_equal(frame1, frame3) + + def test_to_excel_empty_multiindex(self, path): + # GH 19543. + expected = DataFrame([], columns=[0, 1, 2]) + + df = DataFrame([], index=MultiIndex.from_tuples([], names=[0, 1]), columns=[2]) + df.to_excel(path, sheet_name="test1") + + with ExcelFile(path) as reader: + result = pd.read_excel(reader, sheet_name="test1") + tm.assert_frame_equal( + result, expected, check_index_type=False, check_dtype=False + ) + + def test_to_excel_float_format(self, path): + df = DataFrame( + [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) + df.to_excel(path, sheet_name="test1", float_format="%.2f") + + with ExcelFile(path) as reader: + result = pd.read_excel(reader, sheet_name="test1", index_col=0) + + expected = DataFrame( + [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) + tm.assert_frame_equal(result, expected) + + def test_to_excel_output_encoding(self, ext): + # Avoid mixed inferred_type. + df = DataFrame( + [["\u0192", "\u0193", "\u0194"], ["\u0195", "\u0196", "\u0197"]], + index=["A\u0192", "B"], + columns=["X\u0193", "Y", "Z"], + ) + + with tm.ensure_clean("__tmp_to_excel_float_format__." + ext) as filename: + df.to_excel(filename, sheet_name="TestSheet") + result = pd.read_excel(filename, sheet_name="TestSheet", index_col=0) + tm.assert_frame_equal(result, df) + + def test_to_excel_unicode_filename(self, ext): + with tm.ensure_clean("\u0192u." + ext) as filename: + try: + with open(filename, "wb"): + pass + except UnicodeEncodeError: + pytest.skip("No unicode file names on this system") + + df = DataFrame( + [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) + df.to_excel(filename, sheet_name="test1", float_format="%.2f") + + with ExcelFile(filename) as reader: + result = pd.read_excel(reader, sheet_name="test1", index_col=0) + + expected = DataFrame( + [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("use_headers", [True, False]) + @pytest.mark.parametrize("r_idx_nlevels", [1, 2, 3]) + @pytest.mark.parametrize("c_idx_nlevels", [1, 2, 3]) + def test_excel_010_hemstring( + self, merge_cells, c_idx_nlevels, r_idx_nlevels, use_headers, path + ): + def roundtrip(data, header=True, parser_hdr=0, index=True): + data.to_excel(path, header=header, merge_cells=merge_cells, index=index) + + with ExcelFile(path) as xf: + return pd.read_excel( + xf, sheet_name=xf.sheet_names[0], header=parser_hdr + ) + + # Basic test. + parser_header = 0 if use_headers else None + res = roundtrip(DataFrame([0]), use_headers, parser_header) + + assert res.shape == (1, 2) + assert res.iloc[0, 0] is not np.nan + + # More complex tests with multi-index. + nrows = 5 + ncols = 3 + + # ensure limited functionality in 0.10 + # override of gh-2370 until sorted out in 0.11 + + if c_idx_nlevels == 1: + columns = Index([f"a-{i}" for i in range(ncols)], dtype=object) + else: + columns = MultiIndex.from_arrays( + [range(ncols) for _ in range(c_idx_nlevels)], + names=[f"i-{i}" for i in range(c_idx_nlevels)], + ) + if r_idx_nlevels == 1: + index = Index([f"b-{i}" for i in range(nrows)], dtype=object) + else: + index = MultiIndex.from_arrays( + [range(nrows) for _ in range(r_idx_nlevels)], + names=[f"j-{i}" for i in range(r_idx_nlevels)], + ) + + df = DataFrame( + np.ones((nrows, ncols)), + columns=columns, + index=index, + ) + + # This if will be removed once multi-column Excel writing + # is implemented. For now fixing gh-9794. + if c_idx_nlevels > 1: + msg = ( + "Writing to Excel with MultiIndex columns and no index " + "\\('index'=False\\) is not yet implemented." + ) + with pytest.raises(NotImplementedError, match=msg): + roundtrip(df, use_headers, index=False) + else: + res = roundtrip(df, use_headers) + + if use_headers: + assert res.shape == (nrows, ncols + r_idx_nlevels) + else: + # First row taken as columns. + assert res.shape == (nrows - 1, ncols + r_idx_nlevels) + + # No NaNs. + for r in range(len(res.index)): + for c in range(len(res.columns)): + assert res.iloc[r, c] is not np.nan + + def test_duplicated_columns(self, path): + # see gh-5235 + df = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["A", "B", "B"]) + df.to_excel(path, sheet_name="test1") + expected = DataFrame( + [[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["A", "B", "B.1"] + ) + + # By default, we mangle. + result = pd.read_excel(path, sheet_name="test1", index_col=0) + tm.assert_frame_equal(result, expected) + + # see gh-11007, gh-10970 + df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A", "B"]) + df.to_excel(path, sheet_name="test1") + + result = pd.read_excel(path, sheet_name="test1", index_col=0) + expected = DataFrame( + [[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A.1", "B.1"] + ) + tm.assert_frame_equal(result, expected) + + # see gh-10982 + df.to_excel(path, sheet_name="test1", index=False, header=False) + result = pd.read_excel(path, sheet_name="test1", header=None) + + expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) + tm.assert_frame_equal(result, expected) + + def test_swapped_columns(self, path): + # Test for issue #5427. + write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) + write_frame.to_excel(path, sheet_name="test1", columns=["B", "A"]) + + read_frame = pd.read_excel(path, sheet_name="test1", header=0) + + tm.assert_series_equal(write_frame["A"], read_frame["A"]) + tm.assert_series_equal(write_frame["B"], read_frame["B"]) + + def test_invalid_columns(self, path): + # see gh-10982 + write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) + + with pytest.raises(KeyError, match="Not all names specified"): + write_frame.to_excel(path, sheet_name="test1", columns=["B", "C"]) + + with pytest.raises( + KeyError, match="'passes columns are not ALL present dataframe'" + ): + write_frame.to_excel(path, sheet_name="test1", columns=["C", "D"]) + + @pytest.mark.parametrize( + "to_excel_index,read_excel_index_col", + [ + (True, 0), # Include index in write to file + (False, None), # Dont include index in write to file + ], + ) + def test_write_subset_columns(self, path, to_excel_index, read_excel_index_col): + # GH 31677 + write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2], "C": [3, 3, 3]}) + write_frame.to_excel( + path, sheet_name="col_subset_bug", columns=["A", "B"], index=to_excel_index + ) + + expected = write_frame[["A", "B"]] + read_frame = pd.read_excel( + path, sheet_name="col_subset_bug", index_col=read_excel_index_col + ) + + tm.assert_frame_equal(expected, read_frame) + + def test_comment_arg(self, path): + # see gh-18735 + # + # Test the comment argument functionality to pd.read_excel. + + # Create file to read in. + df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) + df.to_excel(path, sheet_name="test_c") + + # Read file without comment arg. + result1 = pd.read_excel(path, sheet_name="test_c", index_col=0) + + result1.iloc[1, 0] = None + result1.iloc[1, 1] = None + result1.iloc[2, 1] = None + + result2 = pd.read_excel(path, sheet_name="test_c", comment="#", index_col=0) + tm.assert_frame_equal(result1, result2) + + def test_comment_default(self, path): + # Re issue #18735 + # Test the comment argument default to pd.read_excel + + # Create file to read in + df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) + df.to_excel(path, sheet_name="test_c") + + # Read file with default and explicit comment=None + result1 = pd.read_excel(path, sheet_name="test_c") + result2 = pd.read_excel(path, sheet_name="test_c", comment=None) + tm.assert_frame_equal(result1, result2) + + def test_comment_used(self, path): + # see gh-18735 + # + # Test the comment argument is working as expected when used. + + # Create file to read in. + df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) + df.to_excel(path, sheet_name="test_c") + + # Test read_frame_comment against manually produced expected output. + expected = DataFrame({"A": ["one", None, "one"], "B": ["two", None, None]}) + result = pd.read_excel(path, sheet_name="test_c", comment="#", index_col=0) + tm.assert_frame_equal(result, expected) + + def test_comment_empty_line(self, path): + # Re issue #18735 + # Test that pd.read_excel ignores commented lines at the end of file + + df = DataFrame({"a": ["1", "#2"], "b": ["2", "3"]}) + df.to_excel(path, index=False) + + # Test that all-comment lines at EoF are ignored + expected = DataFrame({"a": [1], "b": [2]}) + result = pd.read_excel(path, comment="#") + tm.assert_frame_equal(result, expected) + + def test_datetimes(self, path): + # Test writing and reading datetimes. For issue #9139. (xref #9185) + unit = get_exp_unit(path) + datetimes = [ + datetime(2013, 1, 13, 1, 2, 3), + datetime(2013, 1, 13, 2, 45, 56), + datetime(2013, 1, 13, 4, 29, 49), + datetime(2013, 1, 13, 6, 13, 42), + datetime(2013, 1, 13, 7, 57, 35), + datetime(2013, 1, 13, 9, 41, 28), + datetime(2013, 1, 13, 11, 25, 21), + datetime(2013, 1, 13, 13, 9, 14), + datetime(2013, 1, 13, 14, 53, 7), + datetime(2013, 1, 13, 16, 37, 0), + datetime(2013, 1, 13, 18, 20, 52), + ] + + write_frame = DataFrame({"A": datetimes}) + write_frame.to_excel(path, sheet_name="Sheet1") + read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0) + + expected = write_frame.astype(f"M8[{unit}]") + tm.assert_series_equal(expected["A"], read_frame["A"]) + + def test_bytes_io(self, engine): + # see gh-7074 + with BytesIO() as bio: + df = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) + + # Pass engine explicitly, as there is no file path to infer from. + with ExcelWriter(bio, engine=engine) as writer: + df.to_excel(writer) + + bio.seek(0) + reread_df = pd.read_excel(bio, index_col=0) + tm.assert_frame_equal(df, reread_df) + + def test_engine_kwargs(self, engine, path): + # GH#52368 + df = DataFrame([{"A": 1, "B": 2}, {"A": 3, "B": 4}]) + + msgs = { + "odf": r"OpenDocumentSpreadsheet() got an unexpected keyword " + r"argument 'foo'", + "openpyxl": r"__init__() got an unexpected keyword argument 'foo'", + "xlsxwriter": r"__init__() got an unexpected keyword argument 'foo'", + } + + if PY310: + msgs[ + "openpyxl" + ] = "Workbook.__init__() got an unexpected keyword argument 'foo'" + msgs[ + "xlsxwriter" + ] = "Workbook.__init__() got an unexpected keyword argument 'foo'" + + # Handle change in error message for openpyxl (write and append mode) + if engine == "openpyxl" and not os.path.exists(path): + msgs[ + "openpyxl" + ] = r"load_workbook() got an unexpected keyword argument 'foo'" + + with pytest.raises(TypeError, match=re.escape(msgs[engine])): + df.to_excel( + path, + engine=engine, + engine_kwargs={"foo": "bar"}, + ) + + def test_write_lists_dict(self, path): + # see gh-8188. + df = DataFrame( + { + "mixed": ["a", ["b", "c"], {"d": "e", "f": 2}], + "numeric": [1, 2, 3.0], + "str": ["apple", "banana", "cherry"], + } + ) + df.to_excel(path, sheet_name="Sheet1") + read = pd.read_excel(path, sheet_name="Sheet1", header=0, index_col=0) + + expected = df.copy() + expected.mixed = expected.mixed.apply(str) + expected.numeric = expected.numeric.astype("int64") + + tm.assert_frame_equal(read, expected) + + def test_render_as_column_name(self, path): + # see gh-34331 + df = DataFrame({"render": [1, 2], "data": [3, 4]}) + df.to_excel(path, sheet_name="Sheet1") + read = pd.read_excel(path, "Sheet1", index_col=0) + expected = df + tm.assert_frame_equal(read, expected) + + def test_true_and_false_value_options(self, path): + # see gh-13347 + df = DataFrame([["foo", "bar"]], columns=["col1", "col2"], dtype=object) + with option_context("future.no_silent_downcasting", True): + expected = df.replace({"foo": True, "bar": False}).astype("bool") + + df.to_excel(path) + read_frame = pd.read_excel( + path, true_values=["foo"], false_values=["bar"], index_col=0 + ) + tm.assert_frame_equal(read_frame, expected) + + def test_freeze_panes(self, path): + # see gh-15160 + expected = DataFrame([[1, 2], [3, 4]], columns=["col1", "col2"]) + expected.to_excel(path, sheet_name="Sheet1", freeze_panes=(1, 1)) + + result = pd.read_excel(path, index_col=0) + tm.assert_frame_equal(result, expected) + + def test_path_path_lib(self, engine, ext): + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) + writer = partial(df.to_excel, engine=engine) + + reader = partial(pd.read_excel, index_col=0) + result = tm.round_trip_pathlib(writer, reader, path=f"foo{ext}") + tm.assert_frame_equal(result, df) + + def test_path_local_path(self, engine, ext): + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), + ) + writer = partial(df.to_excel, engine=engine) + + reader = partial(pd.read_excel, index_col=0) + result = tm.round_trip_localpath(writer, reader, path=f"foo{ext}") + tm.assert_frame_equal(result, df) + + def test_merged_cell_custom_objects(self, path): + # see GH-27006 + mi = MultiIndex.from_tuples( + [ + (pd.Period("2018"), pd.Period("2018Q1")), + (pd.Period("2018"), pd.Period("2018Q2")), + ] + ) + expected = DataFrame(np.ones((2, 2), dtype="int64"), columns=mi) + expected.to_excel(path) + result = pd.read_excel(path, header=[0, 1], index_col=0) + # need to convert PeriodIndexes to standard Indexes for assert equal + expected.columns = expected.columns.set_levels( + [[str(i) for i in mi.levels[0]], [str(i) for i in mi.levels[1]]], + level=[0, 1], + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype", [None, object]) + def test_raise_when_saving_timezones(self, dtype, tz_aware_fixture, path): + # GH 27008, GH 7056 + tz = tz_aware_fixture + data = pd.Timestamp("2019", tz=tz) + df = DataFrame([data], dtype=dtype) + with pytest.raises(ValueError, match="Excel does not support"): + df.to_excel(path) + + data = data.to_pydatetime() + df = DataFrame([data], dtype=dtype) + with pytest.raises(ValueError, match="Excel does not support"): + df.to_excel(path) + + def test_excel_duplicate_columns_with_names(self, path): + # GH#39695 + df = DataFrame({"A": [0, 1], "B": [10, 11]}) + df.to_excel(path, columns=["A", "B", "A"], index=False) + + result = pd.read_excel(path) + expected = DataFrame([[0, 10, 0], [1, 11, 1]], columns=["A", "B", "A.1"]) + tm.assert_frame_equal(result, expected) + + def test_if_sheet_exists_raises(self, ext): + # GH 40230 + msg = "if_sheet_exists is only valid in append mode (mode='a')" + + with tm.ensure_clean(ext) as f: + with pytest.raises(ValueError, match=re.escape(msg)): + ExcelWriter(f, if_sheet_exists="replace") + + def test_excel_writer_empty_frame(self, engine, ext): + # GH#45793 + with tm.ensure_clean(ext) as path: + with ExcelWriter(path, engine=engine) as writer: + DataFrame().to_excel(writer) + result = pd.read_excel(path) + expected = DataFrame() + tm.assert_frame_equal(result, expected) + + def test_to_excel_empty_frame(self, engine, ext): + # GH#45793 + with tm.ensure_clean(ext) as path: + DataFrame().to_excel(path, engine=engine) + result = pd.read_excel(path) + expected = DataFrame() + tm.assert_frame_equal(result, expected) + + +class TestExcelWriterEngineTests: + @pytest.mark.parametrize( + "klass,ext", + [ + pytest.param(_XlsxWriter, ".xlsx", marks=td.skip_if_no("xlsxwriter")), + pytest.param(_OpenpyxlWriter, ".xlsx", marks=td.skip_if_no("openpyxl")), + ], + ) + def test_ExcelWriter_dispatch(self, klass, ext): + with tm.ensure_clean(ext) as path: + with ExcelWriter(path) as writer: + if ext == ".xlsx" and bool( + import_optional_dependency("xlsxwriter", errors="ignore") + ): + # xlsxwriter has preference over openpyxl if both installed + assert isinstance(writer, _XlsxWriter) + else: + assert isinstance(writer, klass) + + def test_ExcelWriter_dispatch_raises(self): + with pytest.raises(ValueError, match="No engine"): + ExcelWriter("nothing") + + def test_register_writer(self): + class DummyClass(ExcelWriter): + called_save = False + called_write_cells = False + called_sheets = False + _supported_extensions = ("xlsx", "xls") + _engine = "dummy" + + def book(self): + pass + + def _save(self): + type(self).called_save = True + + def _write_cells(self, *args, **kwargs): + type(self).called_write_cells = True + + @property + def sheets(self): + type(self).called_sheets = True + + @classmethod + def assert_called_and_reset(cls): + assert cls.called_save + assert cls.called_write_cells + assert not cls.called_sheets + cls.called_save = False + cls.called_write_cells = False + + register_writer(DummyClass) + + with option_context("io.excel.xlsx.writer", "dummy"): + path = "something.xlsx" + with tm.ensure_clean(path) as filepath: + with ExcelWriter(filepath) as writer: + assert isinstance(writer, DummyClass) + df = DataFrame( + ["a"], + columns=Index(["b"], name="foo"), + index=Index(["c"], name="bar"), + ) + df.to_excel(filepath) + DummyClass.assert_called_and_reset() + + with tm.ensure_clean("something.xls") as filepath: + df.to_excel(filepath, engine="dummy") + DummyClass.assert_called_and_reset() + + +@td.skip_if_no("xlrd") +@td.skip_if_no("openpyxl") +class TestFSPath: + def test_excelfile_fspath(self): + with tm.ensure_clean("foo.xlsx") as path: + df = DataFrame({"A": [1, 2]}) + df.to_excel(path) + with ExcelFile(path) as xl: + result = os.fspath(xl) + assert result == path + + def test_excelwriter_fspath(self): + with tm.ensure_clean("foo.xlsx") as path: + with ExcelWriter(path) as writer: + assert os.fspath(writer) == str(path) + + def test_to_excel_pos_args_deprecation(self): + # GH-54229 + df = DataFrame({"a": [1, 2, 3]}) + msg = ( + r"Starting with pandas version 3.0 all arguments of to_excel except " + r"for the argument 'excel_writer' will be keyword-only." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + buf = BytesIO() + writer = ExcelWriter(buf) + df.to_excel(writer, "Sheet_name_1") + + +@pytest.mark.parametrize("klass", _writers.values()) +def test_subclass_attr(klass): + # testing that subclasses of ExcelWriter don't have public attributes (issue 49602) + attrs_base = {name for name in dir(ExcelWriter) if not name.startswith("_")} + attrs_klass = {name for name in dir(klass) if not name.startswith("_")} + assert not attrs_base.symmetric_difference(attrs_klass) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_xlrd.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_xlrd.py new file mode 100644 index 0000000000000000000000000000000000000000..066393d91eeadcdc08873f4ffeedda0f689337fe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_xlrd.py @@ -0,0 +1,76 @@ +import io + +import numpy as np +import pytest + +from pandas.compat import is_platform_windows + +import pandas as pd +import pandas._testing as tm + +from pandas.io.excel import ExcelFile +from pandas.io.excel._base import inspect_excel_format + +xlrd = pytest.importorskip("xlrd") + +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + + +@pytest.fixture(params=[".xls"]) +def read_ext_xlrd(request): + """ + Valid extensions for reading Excel files with xlrd. + + Similar to read_ext, but excludes .ods, .xlsb, and for xlrd>2 .xlsx, .xlsm + """ + return request.param + + +def test_read_xlrd_book(read_ext_xlrd, datapath): + engine = "xlrd" + sheet_name = "Sheet1" + pth = datapath("io", "data", "excel", "test1.xls") + with xlrd.open_workbook(pth) as book: + with ExcelFile(book, engine=engine) as xl: + result = pd.read_excel(xl, sheet_name=sheet_name, index_col=0) + + expected = pd.read_excel( + book, sheet_name=sheet_name, engine=engine, index_col=0 + ) + tm.assert_frame_equal(result, expected) + + +def test_read_xlsx_fails(datapath): + # GH 29375 + from xlrd.biffh import XLRDError + + path = datapath("io", "data", "excel", "test1.xlsx") + with pytest.raises(XLRDError, match="Excel xlsx file; not supported"): + pd.read_excel(path, engine="xlrd") + + +def test_nan_in_xls(datapath): + # GH 54564 + path = datapath("io", "data", "excel", "test6.xls") + + expected = pd.DataFrame({0: np.r_[0, 2].astype("int64"), 1: np.r_[1, np.nan]}) + + result = pd.read_excel(path, header=None) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "file_header", + [ + b"\x09\x00\x04\x00\x07\x00\x10\x00", + b"\x09\x02\x06\x00\x00\x00\x10\x00", + b"\x09\x04\x06\x00\x00\x00\x10\x00", + b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", + ], +) +def test_read_old_xls_files(file_header): + # GH 41226 + f = io.BytesIO(file_header) + assert inspect_excel_format(f) == "xls" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_xlsxwriter.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_xlsxwriter.py new file mode 100644 index 0000000000000000000000000000000000000000..529367761fc025e3e5d02bea85741c82f64c97ca --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/test_xlsxwriter.py @@ -0,0 +1,86 @@ +import contextlib + +import pytest + +from pandas.compat import is_platform_windows + +from pandas import DataFrame +import pandas._testing as tm + +from pandas.io.excel import ExcelWriter + +xlsxwriter = pytest.importorskip("xlsxwriter") + +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + + +@pytest.fixture +def ext(): + return ".xlsx" + + +def test_column_format(ext): + # Test that column formats are applied to cells. Test for issue #9167. + # Applicable to xlsxwriter only. + openpyxl = pytest.importorskip("openpyxl") + + with tm.ensure_clean(ext) as path: + frame = DataFrame({"A": [123456, 123456], "B": [123456, 123456]}) + + with ExcelWriter(path) as writer: + frame.to_excel(writer) + + # Add a number format to col B and ensure it is applied to cells. + num_format = "#,##0" + write_workbook = writer.book + write_worksheet = write_workbook.worksheets()[0] + col_format = write_workbook.add_format({"num_format": num_format}) + write_worksheet.set_column("B:B", None, col_format) + + with contextlib.closing(openpyxl.load_workbook(path)) as read_workbook: + try: + read_worksheet = read_workbook["Sheet1"] + except TypeError: + # compat + read_worksheet = read_workbook.get_sheet_by_name(name="Sheet1") + + # Get the number format from the cell. + try: + cell = read_worksheet["B2"] + except TypeError: + # compat + cell = read_worksheet.cell("B2") + + try: + read_num_format = cell.number_format + except AttributeError: + read_num_format = cell.style.number_format._format_code + + assert read_num_format == num_format + + +def test_write_append_mode_raises(ext): + msg = "Append mode is not supported with xlsxwriter!" + + with tm.ensure_clean(ext) as f: + with pytest.raises(ValueError, match=msg): + ExcelWriter(f, engine="xlsxwriter", mode="a") + + +@pytest.mark.parametrize("nan_inf_to_errors", [True, False]) +def test_engine_kwargs(ext, nan_inf_to_errors): + # GH 42286 + engine_kwargs = {"options": {"nan_inf_to_errors": nan_inf_to_errors}} + with tm.ensure_clean(ext) as f: + with ExcelWriter(f, engine="xlsxwriter", engine_kwargs=engine_kwargs) as writer: + assert writer.book.nan_inf_to_errors == nan_inf_to_errors + + +def test_book_and_sheets_consistent(ext): + # GH#45687 - Ensure sheets is updated if user modifies book + with tm.ensure_clean(ext) as f: + with ExcelWriter(f, engine="xlsxwriter") as writer: + assert writer.sheets == {} + sheet = writer.book.add_worksheet("test_name") + assert writer.sheets == {"test_name": sheet} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b4e69a5e91ec681b500050a76495b9556bfc7d8a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_console.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_console.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc941331127ee458492bfc5bd0e20e8fdd2741e2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_console.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_css.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_css.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..487eca3628b58389ed7b61e1f1af016efc39e0a4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_css.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_eng_formatting.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_eng_formatting.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c9a52e108219ea5ded7b1f6c3ead0cf34a098424 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_eng_formatting.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_ipython_compat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_ipython_compat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a52d5f3f26986b37dfcaf332037febdd9963237e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_ipython_compat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_printing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_printing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e2e352fd9d5d0cc184841f4a3fffc1b82be91437 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_printing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_to_csv.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_to_csv.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d7764a1c40927c0b5ad11e66e24163a8098c44ae Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_to_csv.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_to_excel.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_to_excel.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2feef0bf31fb6ae70bf7b656838705a8077e3cf4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_to_excel.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_to_html.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_to_html.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0302c86483ef7054c734eef2a12ab392efd14268 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_to_html.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_to_latex.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_to_latex.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91fc16959a23300668df37821998e49fd8205d14 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_to_latex.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_to_markdown.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_to_markdown.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b2f1777b6143bf4e40254c4dc8dbd6fd94c25aa4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_to_markdown.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_to_string.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_to_string.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..655a3c249620e1c50892a8ed4bd173937f94793e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/__pycache__/test_to_string.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..13532684ebd569a8fae4591c1038590e7c85bc63 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_bar.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_bar.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..51dc58c91cf22b3dd86b2ca751e192218453ab0e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_bar.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_exceptions.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_exceptions.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f99d6ffcee23ac2de4fff1e610410ce86242f4fc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_exceptions.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_format.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_format.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c65ca62836bf751361b912eb9f750ecc70e4196 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_format.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_highlight.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_highlight.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e09b89ae09a1cd2ace176d63cf566d209d68e71f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_highlight.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_html.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_html.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d6c3679e581dccaa2c4739a88f347a32f33431a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_html.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_matplotlib.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_matplotlib.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25ade1252f7c43c8b61c35098b2e08ef73d79171 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_matplotlib.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_non_unique.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_non_unique.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..06a1ab0afcaefb0e30a101e36c41cc86ce309c5d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_non_unique.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_style.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_style.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c4d118de09f633b57b871d653ac99a0d98a5a530 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_style.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_to_latex.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_to_latex.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8c8cac1bc3cc1a2f381e65507f75de697410c39c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_to_latex.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_to_string.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_to_string.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb310530fe530ccc91b255c2eb4c8b86b0635756 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_to_string.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_tooltip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_tooltip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..56325ae0dcb6c878d3805bf060b4bcd04becfca6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/__pycache__/test_tooltip.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_bar.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_bar.py new file mode 100644 index 0000000000000000000000000000000000000000..b0e4712e8bb3d15959bddc0bd8697981b16bd8ef --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_bar.py @@ -0,0 +1,358 @@ +import io + +import numpy as np +import pytest + +from pandas import ( + NA, + DataFrame, + read_csv, +) + +pytest.importorskip("jinja2") + + +def bar_grad(a=None, b=None, c=None, d=None): + """Used in multiple tests to simplify formatting of expected result""" + ret = [("width", "10em")] + if all(x is None for x in [a, b, c, d]): + return ret + return ret + [ + ( + "background", + f"linear-gradient(90deg,{','.join([x for x in [a, b, c, d] if x])})", + ) + ] + + +def no_bar(): + return bar_grad() + + +def bar_to(x, color="#d65f5f"): + return bar_grad(f" {color} {x:.1f}%", f" transparent {x:.1f}%") + + +def bar_from_to(x, y, color="#d65f5f"): + return bar_grad( + f" transparent {x:.1f}%", + f" {color} {x:.1f}%", + f" {color} {y:.1f}%", + f" transparent {y:.1f}%", + ) + + +@pytest.fixture +def df_pos(): + return DataFrame([[1], [2], [3]]) + + +@pytest.fixture +def df_neg(): + return DataFrame([[-1], [-2], [-3]]) + + +@pytest.fixture +def df_mix(): + return DataFrame([[-3], [1], [2]]) + + +@pytest.mark.parametrize( + "align, exp", + [ + ("left", [no_bar(), bar_to(50), bar_to(100)]), + ("right", [bar_to(100), bar_from_to(50, 100), no_bar()]), + ("mid", [bar_to(33.33), bar_to(66.66), bar_to(100)]), + ("zero", [bar_from_to(50, 66.7), bar_from_to(50, 83.3), bar_from_to(50, 100)]), + ("mean", [bar_to(50), no_bar(), bar_from_to(50, 100)]), + (2.0, [bar_to(50), no_bar(), bar_from_to(50, 100)]), + (np.median, [bar_to(50), no_bar(), bar_from_to(50, 100)]), + ], +) +def test_align_positive_cases(df_pos, align, exp): + # test different align cases for all positive values + result = df_pos.style.bar(align=align)._compute().ctx + expected = {(0, 0): exp[0], (1, 0): exp[1], (2, 0): exp[2]} + assert result == expected + + +@pytest.mark.parametrize( + "align, exp", + [ + ("left", [bar_to(100), bar_to(50), no_bar()]), + ("right", [no_bar(), bar_from_to(50, 100), bar_to(100)]), + ("mid", [bar_from_to(66.66, 100), bar_from_to(33.33, 100), bar_to(100)]), + ("zero", [bar_from_to(33.33, 50), bar_from_to(16.66, 50), bar_to(50)]), + ("mean", [bar_from_to(50, 100), no_bar(), bar_to(50)]), + (-2.0, [bar_from_to(50, 100), no_bar(), bar_to(50)]), + (np.median, [bar_from_to(50, 100), no_bar(), bar_to(50)]), + ], +) +def test_align_negative_cases(df_neg, align, exp): + # test different align cases for all negative values + result = df_neg.style.bar(align=align)._compute().ctx + expected = {(0, 0): exp[0], (1, 0): exp[1], (2, 0): exp[2]} + assert result == expected + + +@pytest.mark.parametrize( + "align, exp", + [ + ("left", [no_bar(), bar_to(80), bar_to(100)]), + ("right", [bar_to(100), bar_from_to(80, 100), no_bar()]), + ("mid", [bar_to(60), bar_from_to(60, 80), bar_from_to(60, 100)]), + ("zero", [bar_to(50), bar_from_to(50, 66.66), bar_from_to(50, 83.33)]), + ("mean", [bar_to(50), bar_from_to(50, 66.66), bar_from_to(50, 83.33)]), + (-0.0, [bar_to(50), bar_from_to(50, 66.66), bar_from_to(50, 83.33)]), + (np.nanmedian, [bar_to(50), no_bar(), bar_from_to(50, 62.5)]), + ], +) +@pytest.mark.parametrize("nans", [True, False]) +def test_align_mixed_cases(df_mix, align, exp, nans): + # test different align cases for mixed positive and negative values + # also test no impact of NaNs and no_bar + expected = {(0, 0): exp[0], (1, 0): exp[1], (2, 0): exp[2]} + if nans: + df_mix.loc[3, :] = np.nan + expected.update({(3, 0): no_bar()}) + result = df_mix.style.bar(align=align)._compute().ctx + assert result == expected + + +@pytest.mark.parametrize( + "align, exp", + [ + ( + "left", + { + "index": [[no_bar(), no_bar()], [bar_to(100), bar_to(100)]], + "columns": [[no_bar(), bar_to(100)], [no_bar(), bar_to(100)]], + "none": [[no_bar(), bar_to(33.33)], [bar_to(66.66), bar_to(100)]], + }, + ), + ( + "mid", + { + "index": [[bar_to(33.33), bar_to(50)], [bar_to(100), bar_to(100)]], + "columns": [[bar_to(50), bar_to(100)], [bar_to(75), bar_to(100)]], + "none": [[bar_to(25), bar_to(50)], [bar_to(75), bar_to(100)]], + }, + ), + ( + "zero", + { + "index": [ + [bar_from_to(50, 66.66), bar_from_to(50, 75)], + [bar_from_to(50, 100), bar_from_to(50, 100)], + ], + "columns": [ + [bar_from_to(50, 75), bar_from_to(50, 100)], + [bar_from_to(50, 87.5), bar_from_to(50, 100)], + ], + "none": [ + [bar_from_to(50, 62.5), bar_from_to(50, 75)], + [bar_from_to(50, 87.5), bar_from_to(50, 100)], + ], + }, + ), + ( + 2, + { + "index": [ + [bar_to(50), no_bar()], + [bar_from_to(50, 100), bar_from_to(50, 100)], + ], + "columns": [ + [bar_to(50), no_bar()], + [bar_from_to(50, 75), bar_from_to(50, 100)], + ], + "none": [ + [bar_from_to(25, 50), no_bar()], + [bar_from_to(50, 75), bar_from_to(50, 100)], + ], + }, + ), + ], +) +@pytest.mark.parametrize("axis", ["index", "columns", "none"]) +def test_align_axis(align, exp, axis): + # test all axis combinations with positive values and different aligns + data = DataFrame([[1, 2], [3, 4]]) + result = ( + data.style.bar(align=align, axis=None if axis == "none" else axis) + ._compute() + .ctx + ) + expected = { + (0, 0): exp[axis][0][0], + (0, 1): exp[axis][0][1], + (1, 0): exp[axis][1][0], + (1, 1): exp[axis][1][1], + } + assert result == expected + + +@pytest.mark.parametrize( + "values, vmin, vmax", + [ + ("positive", 1.5, 2.5), + ("negative", -2.5, -1.5), + ("mixed", -2.5, 1.5), + ], +) +@pytest.mark.parametrize("nullify", [None, "vmin", "vmax"]) # test min/max separately +@pytest.mark.parametrize("align", ["left", "right", "zero", "mid"]) +def test_vmin_vmax_clipping(df_pos, df_neg, df_mix, values, vmin, vmax, nullify, align): + # test that clipping occurs if any vmin > data_values or vmax < data_values + if align == "mid": # mid acts as left or right in each case + if values == "positive": + align = "left" + elif values == "negative": + align = "right" + df = {"positive": df_pos, "negative": df_neg, "mixed": df_mix}[values] + vmin = None if nullify == "vmin" else vmin + vmax = None if nullify == "vmax" else vmax + + clip_df = df.where(df <= (vmax if vmax else 999), other=vmax) + clip_df = clip_df.where(clip_df >= (vmin if vmin else -999), other=vmin) + + result = ( + df.style.bar(align=align, vmin=vmin, vmax=vmax, color=["red", "green"]) + ._compute() + .ctx + ) + expected = clip_df.style.bar(align=align, color=["red", "green"])._compute().ctx + assert result == expected + + +@pytest.mark.parametrize( + "values, vmin, vmax", + [ + ("positive", 0.5, 4.5), + ("negative", -4.5, -0.5), + ("mixed", -4.5, 4.5), + ], +) +@pytest.mark.parametrize("nullify", [None, "vmin", "vmax"]) # test min/max separately +@pytest.mark.parametrize("align", ["left", "right", "zero", "mid"]) +def test_vmin_vmax_widening(df_pos, df_neg, df_mix, values, vmin, vmax, nullify, align): + # test that widening occurs if any vmax > data_values or vmin < data_values + if align == "mid": # mid acts as left or right in each case + if values == "positive": + align = "left" + elif values == "negative": + align = "right" + df = {"positive": df_pos, "negative": df_neg, "mixed": df_mix}[values] + vmin = None if nullify == "vmin" else vmin + vmax = None if nullify == "vmax" else vmax + + expand_df = df.copy() + expand_df.loc[3, :], expand_df.loc[4, :] = vmin, vmax + + result = ( + df.style.bar(align=align, vmin=vmin, vmax=vmax, color=["red", "green"]) + ._compute() + .ctx + ) + expected = expand_df.style.bar(align=align, color=["red", "green"])._compute().ctx + assert result.items() <= expected.items() + + +def test_numerics(): + # test data is pre-selected for numeric values + data = DataFrame([[1, "a"], [2, "b"]]) + result = data.style.bar()._compute().ctx + assert (0, 1) not in result + assert (1, 1) not in result + + +@pytest.mark.parametrize( + "align, exp", + [ + ("left", [no_bar(), bar_to(100, "green")]), + ("right", [bar_to(100, "red"), no_bar()]), + ("mid", [bar_to(25, "red"), bar_from_to(25, 100, "green")]), + ("zero", [bar_from_to(33.33, 50, "red"), bar_from_to(50, 100, "green")]), + ], +) +def test_colors_mixed(align, exp): + data = DataFrame([[-1], [3]]) + result = data.style.bar(align=align, color=["red", "green"])._compute().ctx + assert result == {(0, 0): exp[0], (1, 0): exp[1]} + + +def test_bar_align_height(): + # test when keyword height is used 'no-repeat center' and 'background-size' present + data = DataFrame([[1], [2]]) + result = data.style.bar(align="left", height=50)._compute().ctx + bg_s = "linear-gradient(90deg, #d65f5f 100.0%, transparent 100.0%) no-repeat center" + expected = { + (0, 0): [("width", "10em")], + (1, 0): [ + ("width", "10em"), + ("background", bg_s), + ("background-size", "100% 50.0%"), + ], + } + assert result == expected + + +def test_bar_value_error_raises(): + df = DataFrame({"A": [-100, -60, -30, -20]}) + + msg = "`align` should be in {'left', 'right', 'mid', 'mean', 'zero'} or" + with pytest.raises(ValueError, match=msg): + df.style.bar(align="poorly", color=["#d65f5f", "#5fba7d"]).to_html() + + msg = r"`width` must be a value in \[0, 100\]" + with pytest.raises(ValueError, match=msg): + df.style.bar(width=200).to_html() + + msg = r"`height` must be a value in \[0, 100\]" + with pytest.raises(ValueError, match=msg): + df.style.bar(height=200).to_html() + + +def test_bar_color_and_cmap_error_raises(): + df = DataFrame({"A": [1, 2, 3, 4]}) + msg = "`color` and `cmap` cannot both be given" + # Test that providing both color and cmap raises a ValueError + with pytest.raises(ValueError, match=msg): + df.style.bar(color="#d65f5f", cmap="viridis").to_html() + + +def test_bar_invalid_color_type_error_raises(): + df = DataFrame({"A": [1, 2, 3, 4]}) + msg = ( + r"`color` must be string or list or tuple of 2 strings," + r"\(eg: color=\['#d65f5f', '#5fba7d'\]\)" + ) + # Test that providing an invalid color type raises a ValueError + with pytest.raises(ValueError, match=msg): + df.style.bar(color=123).to_html() + + # Test that providing a color list with more than two elements raises a ValueError + with pytest.raises(ValueError, match=msg): + df.style.bar(color=["#d65f5f", "#5fba7d", "#abcdef"]).to_html() + + +def test_styler_bar_with_NA_values(): + df1 = DataFrame({"A": [1, 2, NA, 4]}) + df2 = DataFrame([[NA, NA], [NA, NA]]) + expected_substring = "style type=" + html_output1 = df1.style.bar(subset="A").to_html() + html_output2 = df2.style.bar(align="left", axis=None).to_html() + assert expected_substring in html_output1 + assert expected_substring in html_output2 + + +def test_style_bar_with_pyarrow_NA_values(): + data = """name,age,test1,test2,teacher + Adam,15,95.0,80,Ashby + Bob,16,81.0,82,Ashby + Dave,16,89.0,84,Jones + Fred,15,,88,Jones""" + df = read_csv(io.StringIO(data), dtype_backend="pyarrow") + expected_substring = "style type=" + html_output = df.style.bar(subset="test1").to_html() + assert expected_substring in html_output diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_exceptions.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_exceptions.py new file mode 100644 index 0000000000000000000000000000000000000000..d52e3a37e7693dadce34f73fc03a0790c7a0b4d3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_exceptions.py @@ -0,0 +1,44 @@ +import pytest + +jinja2 = pytest.importorskip("jinja2") + +from pandas import ( + DataFrame, + MultiIndex, +) + +from pandas.io.formats.style import Styler + + +@pytest.fixture +def df(): + return DataFrame( + data=[[0, -0.609], [1, -1.228]], + columns=["A", "B"], + index=["x", "y"], + ) + + +@pytest.fixture +def styler(df): + return Styler(df, uuid_len=0) + + +def test_concat_bad_columns(styler): + msg = "`other.data` must have same columns as `Styler.data" + with pytest.raises(ValueError, match=msg): + styler.concat(DataFrame([[1, 2]]).style) + + +def test_concat_bad_type(styler): + msg = "`other` must be of type `Styler`" + with pytest.raises(TypeError, match=msg): + styler.concat(DataFrame([[1, 2]])) + + +def test_concat_bad_index_levels(styler, df): + df = df.copy() + df.index = MultiIndex.from_tuples([(0, 0), (1, 1)]) + msg = "number of index levels must be same in `other`" + with pytest.raises(ValueError, match=msg): + styler.concat(df.style) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_format.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_format.py new file mode 100644 index 0000000000000000000000000000000000000000..1c84816ead140b95f14df8dbeccc83b317ac239a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_format.py @@ -0,0 +1,562 @@ +import numpy as np +import pytest + +from pandas import ( + NA, + DataFrame, + IndexSlice, + MultiIndex, + NaT, + Timestamp, + option_context, +) + +pytest.importorskip("jinja2") +from pandas.io.formats.style import Styler +from pandas.io.formats.style_render import _str_escape + + +@pytest.fixture +def df(): + return DataFrame( + data=[[0, -0.609], [1, -1.228]], + columns=["A", "B"], + index=["x", "y"], + ) + + +@pytest.fixture +def styler(df): + return Styler(df, uuid_len=0) + + +@pytest.fixture +def df_multi(): + return DataFrame( + data=np.arange(16).reshape(4, 4), + columns=MultiIndex.from_product([["A", "B"], ["a", "b"]]), + index=MultiIndex.from_product([["X", "Y"], ["x", "y"]]), + ) + + +@pytest.fixture +def styler_multi(df_multi): + return Styler(df_multi, uuid_len=0) + + +def test_display_format(styler): + ctx = styler.format("{:0.1f}")._translate(True, True) + assert all(["display_value" in c for c in row] for row in ctx["body"]) + assert all([len(c["display_value"]) <= 3 for c in row[1:]] for row in ctx["body"]) + assert len(ctx["body"][0][1]["display_value"].lstrip("-")) <= 3 + + +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize("columns", [True, False]) +def test_display_format_index(styler, index, columns): + exp_index = ["x", "y"] + if index: + styler.format_index(lambda v: v.upper(), axis=0) # test callable + exp_index = ["X", "Y"] + + exp_columns = ["A", "B"] + if columns: + styler.format_index("*{}*", axis=1) # test string + exp_columns = ["*A*", "*B*"] + + ctx = styler._translate(True, True) + + for r, row in enumerate(ctx["body"]): + assert row[0]["display_value"] == exp_index[r] + + for c, col in enumerate(ctx["head"][1:]): + assert col["display_value"] == exp_columns[c] + + +def test_format_dict(styler): + ctx = styler.format({"A": "{:0.1f}", "B": "{0:.2%}"})._translate(True, True) + assert ctx["body"][0][1]["display_value"] == "0.0" + assert ctx["body"][0][2]["display_value"] == "-60.90%" + + +def test_format_index_dict(styler): + ctx = styler.format_index({0: lambda v: v.upper()})._translate(True, True) + for i, val in enumerate(["X", "Y"]): + assert ctx["body"][i][0]["display_value"] == val + + +def test_format_string(styler): + ctx = styler.format("{:.2f}")._translate(True, True) + assert ctx["body"][0][1]["display_value"] == "0.00" + assert ctx["body"][0][2]["display_value"] == "-0.61" + assert ctx["body"][1][1]["display_value"] == "1.00" + assert ctx["body"][1][2]["display_value"] == "-1.23" + + +def test_format_callable(styler): + ctx = styler.format(lambda v: "neg" if v < 0 else "pos")._translate(True, True) + assert ctx["body"][0][1]["display_value"] == "pos" + assert ctx["body"][0][2]["display_value"] == "neg" + assert ctx["body"][1][1]["display_value"] == "pos" + assert ctx["body"][1][2]["display_value"] == "neg" + + +def test_format_with_na_rep(): + # GH 21527 28358 + df = DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + + ctx = df.style.format(None, na_rep="-")._translate(True, True) + assert ctx["body"][0][1]["display_value"] == "-" + assert ctx["body"][0][2]["display_value"] == "-" + + ctx = df.style.format("{:.2%}", na_rep="-")._translate(True, True) + assert ctx["body"][0][1]["display_value"] == "-" + assert ctx["body"][0][2]["display_value"] == "-" + assert ctx["body"][1][1]["display_value"] == "110.00%" + assert ctx["body"][1][2]["display_value"] == "120.00%" + + ctx = df.style.format("{:.2%}", na_rep="-", subset=["B"])._translate(True, True) + assert ctx["body"][0][2]["display_value"] == "-" + assert ctx["body"][1][2]["display_value"] == "120.00%" + + +def test_format_index_with_na_rep(): + df = DataFrame([[1, 2, 3, 4, 5]], columns=["A", None, np.nan, NaT, NA]) + ctx = df.style.format_index(None, na_rep="--", axis=1)._translate(True, True) + assert ctx["head"][0][1]["display_value"] == "A" + for i in [2, 3, 4, 5]: + assert ctx["head"][0][i]["display_value"] == "--" + + +def test_format_non_numeric_na(): + # GH 21527 28358 + df = DataFrame( + { + "object": [None, np.nan, "foo"], + "datetime": [None, NaT, Timestamp("20120101")], + } + ) + ctx = df.style.format(None, na_rep="-")._translate(True, True) + assert ctx["body"][0][1]["display_value"] == "-" + assert ctx["body"][0][2]["display_value"] == "-" + assert ctx["body"][1][1]["display_value"] == "-" + assert ctx["body"][1][2]["display_value"] == "-" + + +@pytest.mark.parametrize( + "func, attr, kwargs", + [ + ("format", "_display_funcs", {}), + ("format_index", "_display_funcs_index", {"axis": 0}), + ("format_index", "_display_funcs_columns", {"axis": 1}), + ], +) +def test_format_clear(styler, func, attr, kwargs): + assert (0, 0) not in getattr(styler, attr) # using default + getattr(styler, func)("{:.2f}", **kwargs) + assert (0, 0) in getattr(styler, attr) # formatter is specified + getattr(styler, func)(**kwargs) + assert (0, 0) not in getattr(styler, attr) # formatter cleared to default + + +@pytest.mark.parametrize( + "escape, exp", + [ + ("html", "<>&"%$#_{}~^\\~ ^ \\ "), + ( + "latex", + '<>\\&"\\%\\$\\#\\_\\{\\}\\textasciitilde \\textasciicircum ' + "\\textbackslash \\textasciitilde \\space \\textasciicircum \\space " + "\\textbackslash \\space ", + ), + ], +) +def test_format_escape_html(escape, exp): + chars = '<>&"%$#_{}~^\\~ ^ \\ ' + df = DataFrame([[chars]]) + + s = Styler(df, uuid_len=0).format("&{0}&", escape=None) + expected = f'
&{chars}&&{exp}&X&<>&">X&
+ + + + + + + + + + + + + + + + +
 A
a2.610000
b2.690000
+ + + """ + ) + assert result == expected + + +def test_w3_html_format(styler): + styler.set_uuid("").set_table_styles([{"selector": "th", "props": "att2:v2;"}]).map( + lambda x: "att1:v1;" + ).set_table_attributes('class="my-cls1" style="attr3:v3;"').set_td_classes( + DataFrame(["my-cls2"], index=["a"], columns=["A"]) + ).format( + "{:.1f}" + ).set_caption( + "A comprehensive test" + ) + expected = dedent( + """\ + + + + + + + + + + + + + + + + + + + +
A comprehensive test
 A
a2.6
b2.7
+ """ + ) + assert expected == styler.to_html() + + +def test_colspan_w3(): + # GH 36223 + df = DataFrame(data=[[1, 2]], columns=[["l0", "l0"], ["l1a", "l1b"]]) + styler = Styler(df, uuid="_", cell_ids=False) + assert 'l0' in styler.to_html() + + +def test_rowspan_w3(): + # GH 38533 + df = DataFrame(data=[[1, 2]], index=[["l0", "l0"], ["l1a", "l1b"]]) + styler = Styler(df, uuid="_", cell_ids=False) + assert 'l0' in styler.to_html() + + +def test_styles(styler): + styler.set_uuid("abc") + styler.set_table_styles([{"selector": "td", "props": "color: red;"}]) + result = styler.to_html(doctype_html=True) + expected = dedent( + """\ + + + + + + + + + + + + + + + + + + + + + + + + +
 A
a2.610000
b2.690000
+ + + """ + ) + assert result == expected + + +def test_doctype(styler): + result = styler.to_html(doctype_html=False) + assert "" not in result + assert "" not in result + assert "" not in result + assert "" not in result + + +def test_doctype_encoding(styler): + with option_context("styler.render.encoding", "ASCII"): + result = styler.to_html(doctype_html=True) + assert '' in result + result = styler.to_html(doctype_html=True, encoding="ANSI") + assert '' in result + + +def test_bold_headers_arg(styler): + result = styler.to_html(bold_headers=True) + assert "th {\n font-weight: bold;\n}" in result + result = styler.to_html() + assert "th {\n font-weight: bold;\n}" not in result + + +def test_caption_arg(styler): + result = styler.to_html(caption="foo bar") + assert "foo bar" in result + result = styler.to_html() + assert "foo bar" not in result + + +def test_block_names(tpl_style, tpl_table): + # catch accidental removal of a block + expected_style = { + "before_style", + "style", + "table_styles", + "before_cellstyle", + "cellstyle", + } + expected_table = { + "before_table", + "table", + "caption", + "thead", + "tbody", + "after_table", + "before_head_rows", + "head_tr", + "after_head_rows", + "before_rows", + "tr", + "after_rows", + } + result1 = set(tpl_style.blocks) + assert result1 == expected_style + + result2 = set(tpl_table.blocks) + assert result2 == expected_table + + +def test_from_custom_template_table(tmpdir): + p = tmpdir.mkdir("tpl").join("myhtml_table.tpl") + p.write( + dedent( + """\ + {% extends "html_table.tpl" %} + {% block table %} +

{{custom_title}}

+ {{ super() }} + {% endblock table %}""" + ) + ) + result = Styler.from_custom_template(str(tmpdir.join("tpl")), "myhtml_table.tpl") + assert issubclass(result, Styler) + assert result.env is not Styler.env + assert result.template_html_table is not Styler.template_html_table + styler = result(DataFrame({"A": [1, 2]})) + assert "

My Title

\n\n\n + {{ super() }} + {% endblock style %}""" + ) + ) + result = Styler.from_custom_template( + str(tmpdir.join("tpl")), html_style="myhtml_style.tpl" + ) + assert issubclass(result, Styler) + assert result.env is not Styler.env + assert result.template_html_style is not Styler.template_html_style + styler = result(DataFrame({"A": [1, 2]})) + assert '\n\nfull cap" in styler.to_html() + + +@pytest.mark.parametrize("index", [False, True]) +@pytest.mark.parametrize("columns", [False, True]) +@pytest.mark.parametrize("index_name", [True, False]) +def test_sticky_basic(styler, index, columns, index_name): + if index_name: + styler.index.name = "some text" + if index: + styler.set_sticky(axis=0) + if columns: + styler.set_sticky(axis=1) + + left_css = ( + "#T_ {0} {{\n position: sticky;\n background-color: inherit;\n" + " left: 0px;\n z-index: {1};\n}}" + ) + top_css = ( + "#T_ {0} {{\n position: sticky;\n background-color: inherit;\n" + " top: {1}px;\n z-index: {2};\n{3}}}" + ) + + res = styler.set_uuid("").to_html() + + # test index stickys over thead and tbody + assert (left_css.format("thead tr th:nth-child(1)", "3 !important") in res) is index + assert (left_css.format("tbody tr th:nth-child(1)", "1") in res) is index + + # test column stickys including if name row + assert ( + top_css.format("thead tr:nth-child(1) th", "0", "2", " height: 25px;\n") in res + ) is (columns and index_name) + assert ( + top_css.format("thead tr:nth-child(2) th", "25", "2", " height: 25px;\n") + in res + ) is (columns and index_name) + assert (top_css.format("thead tr:nth-child(1) th", "0", "2", "") in res) is ( + columns and not index_name + ) + + +@pytest.mark.parametrize("index", [False, True]) +@pytest.mark.parametrize("columns", [False, True]) +def test_sticky_mi(styler_mi, index, columns): + if index: + styler_mi.set_sticky(axis=0) + if columns: + styler_mi.set_sticky(axis=1) + + left_css = ( + "#T_ {0} {{\n position: sticky;\n background-color: inherit;\n" + " left: {1}px;\n min-width: 75px;\n max-width: 75px;\n z-index: {2};\n}}" + ) + top_css = ( + "#T_ {0} {{\n position: sticky;\n background-color: inherit;\n" + " top: {1}px;\n height: 25px;\n z-index: {2};\n}}" + ) + + res = styler_mi.set_uuid("").to_html() + + # test the index stickys for thead and tbody over both levels + assert ( + left_css.format("thead tr th:nth-child(1)", "0", "3 !important") in res + ) is index + assert (left_css.format("tbody tr th.level0", "0", "1") in res) is index + assert ( + left_css.format("thead tr th:nth-child(2)", "75", "3 !important") in res + ) is index + assert (left_css.format("tbody tr th.level1", "75", "1") in res) is index + + # test the column stickys for each level row + assert (top_css.format("thead tr:nth-child(1) th", "0", "2") in res) is columns + assert (top_css.format("thead tr:nth-child(2) th", "25", "2") in res) is columns + + +@pytest.mark.parametrize("index", [False, True]) +@pytest.mark.parametrize("columns", [False, True]) +@pytest.mark.parametrize("levels", [[1], ["one"], "one"]) +def test_sticky_levels(styler_mi, index, columns, levels): + styler_mi.index.names, styler_mi.columns.names = ["zero", "one"], ["zero", "one"] + if index: + styler_mi.set_sticky(axis=0, levels=levels) + if columns: + styler_mi.set_sticky(axis=1, levels=levels) + + left_css = ( + "#T_ {0} {{\n position: sticky;\n background-color: inherit;\n" + " left: {1}px;\n min-width: 75px;\n max-width: 75px;\n z-index: {2};\n}}" + ) + top_css = ( + "#T_ {0} {{\n position: sticky;\n background-color: inherit;\n" + " top: {1}px;\n height: 25px;\n z-index: {2};\n}}" + ) + + res = styler_mi.set_uuid("").to_html() + + # test no sticking of level0 + assert "#T_ thead tr th:nth-child(1)" not in res + assert "#T_ tbody tr th.level0" not in res + assert "#T_ thead tr:nth-child(1) th" not in res + + # test sticking level1 + assert ( + left_css.format("thead tr th:nth-child(2)", "0", "3 !important") in res + ) is index + assert (left_css.format("tbody tr th.level1", "0", "1") in res) is index + assert (top_css.format("thead tr:nth-child(2) th", "0", "2") in res) is columns + + +def test_sticky_raises(styler): + with pytest.raises(ValueError, match="No axis named bad for object type DataFrame"): + styler.set_sticky(axis="bad") + + +@pytest.mark.parametrize( + "sparse_index, sparse_columns", + [(True, True), (True, False), (False, True), (False, False)], +) +def test_sparse_options(sparse_index, sparse_columns): + cidx = MultiIndex.from_tuples([("Z", "a"), ("Z", "b"), ("Y", "c")]) + ridx = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "c")]) + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=ridx, columns=cidx) + styler = df.style + + default_html = styler.to_html() # defaults under pd.options to (True , True) + + with option_context( + "styler.sparse.index", sparse_index, "styler.sparse.columns", sparse_columns + ): + html1 = styler.to_html() + assert (html1 == default_html) is (sparse_index and sparse_columns) + html2 = styler.to_html(sparse_index=sparse_index, sparse_columns=sparse_columns) + assert html1 == html2 + + +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize("columns", [True, False]) +def test_map_header_cell_ids(styler, index, columns): + # GH 41893 + func = lambda v: "attr: val;" + styler.uuid, styler.cell_ids = "", False + if index: + styler.map_index(func, axis="index") + if columns: + styler.map_index(func, axis="columns") + + result = styler.to_html() + + # test no data cell ids + assert '2.610000' in result + assert '2.690000' in result + + # test index header ids where needed and css styles + assert ( + 'a' in result + ) is index + assert ( + 'b' in result + ) is index + assert ("#T__level0_row0, #T__level0_row1 {\n attr: val;\n}" in result) is index + + # test column header ids where needed and css styles + assert ( + 'A' in result + ) is columns + assert ("#T__level0_col0 {\n attr: val;\n}" in result) is columns + + +@pytest.mark.parametrize("rows", [True, False]) +@pytest.mark.parametrize("cols", [True, False]) +def test_maximums(styler_mi, rows, cols): + result = styler_mi.to_html( + max_rows=2 if rows else None, + max_columns=2 if cols else None, + ) + + assert ">5" in result # [[0,1], [4,5]] always visible + assert (">8" in result) is not rows # first trimmed vertical element + assert (">2" in result) is not cols # first trimmed horizontal element + + +def test_replaced_css_class_names(): + css = { + "row_heading": "ROWHEAD", + # "col_heading": "COLHEAD", + "index_name": "IDXNAME", + # "col": "COL", + "row": "ROW", + # "col_trim": "COLTRIM", + "row_trim": "ROWTRIM", + "level": "LEVEL", + "data": "DATA", + "blank": "BLANK", + } + midx = MultiIndex.from_product([["a", "b"], ["c", "d"]]) + styler_mi = Styler( + DataFrame(np.arange(16).reshape(4, 4), index=midx, columns=midx), + uuid_len=0, + ).set_table_styles(css_class_names=css) + styler_mi.index.names = ["n1", "n2"] + styler_mi.hide(styler_mi.index[1:], axis=0) + styler_mi.hide(styler_mi.columns[1:], axis=1) + styler_mi.map_index(lambda v: "color: red;", axis=0) + styler_mi.map_index(lambda v: "color: green;", axis=1) + styler_mi.map(lambda v: "color: blue;") + expected = dedent( + """\ + + + + + + + + + + + + + + + + + + + + + + + + + + +
 n1a
 n2c
n1n2 
ac0
+ """ + ) + result = styler_mi.to_html() + assert result == expected + + +def test_include_css_style_rules_only_for_visible_cells(styler_mi): + # GH 43619 + result = ( + styler_mi.set_uuid("") + .map(lambda v: "color: blue;") + .hide(styler_mi.data.columns[1:], axis="columns") + .hide(styler_mi.data.index[1:], axis="index") + .to_html() + ) + expected_styles = dedent( + """\ + + """ + ) + assert expected_styles in result + + +def test_include_css_style_rules_only_for_visible_index_labels(styler_mi): + # GH 43619 + result = ( + styler_mi.set_uuid("") + .map_index(lambda v: "color: blue;", axis="index") + .hide(styler_mi.data.columns, axis="columns") + .hide(styler_mi.data.index[1:], axis="index") + .to_html() + ) + expected_styles = dedent( + """\ + + """ + ) + assert expected_styles in result + + +def test_include_css_style_rules_only_for_visible_column_labels(styler_mi): + # GH 43619 + result = ( + styler_mi.set_uuid("") + .map_index(lambda v: "color: blue;", axis="columns") + .hide(styler_mi.data.columns[1:], axis="columns") + .hide(styler_mi.data.index, axis="index") + .to_html() + ) + expected_styles = dedent( + """\ + + """ + ) + assert expected_styles in result + + +def test_hiding_index_columns_multiindex_alignment(): + # gh 43644 + midx = MultiIndex.from_product( + [["i0", "j0"], ["i1"], ["i2", "j2"]], names=["i-0", "i-1", "i-2"] + ) + cidx = MultiIndex.from_product( + [["c0"], ["c1", "d1"], ["c2", "d2"]], names=["c-0", "c-1", "c-2"] + ) + df = DataFrame(np.arange(16).reshape(4, 4), index=midx, columns=cidx) + styler = Styler(df, uuid_len=0) + styler.hide(level=1, axis=0).hide(level=0, axis=1) + styler.hide([("j0", "i1", "j2")], axis=0) + styler.hide([("c0", "d1", "d2")], axis=1) + result = styler.to_html() + expected = dedent( + """\ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
 c-1c1d1
 c-2c2d2c2
i-0i-2   
i0i2012
j2456
j0i28910
+ """ + ) + assert result == expected + + +def test_hiding_index_columns_multiindex_trimming(): + # gh 44272 + df = DataFrame(np.arange(64).reshape(8, 8)) + df.columns = MultiIndex.from_product([[0, 1, 2, 3], [0, 1]]) + df.index = MultiIndex.from_product([[0, 1, 2, 3], [0, 1]]) + df.index.names, df.columns.names = ["a", "b"], ["c", "d"] + styler = Styler(df, cell_ids=False, uuid_len=0) + styler.hide([(0, 0), (0, 1), (1, 0)], axis=1).hide([(0, 0), (0, 1), (1, 0)], axis=0) + with option_context("styler.render.max_rows", 4, "styler.render.max_columns", 4): + result = styler.to_html() + + expected = dedent( + """\ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
 c123
 d1010...
ab     
1127282930...
2035363738...
143444546...
3051525354...
.....................
+ """ + ) + + assert result == expected + + +@pytest.mark.parametrize("type", ["data", "index"]) +@pytest.mark.parametrize( + "text, exp, found", + [ + ("no link, just text", False, ""), + ("subdomain not www: sub.web.com", False, ""), + ("www subdomain: www.web.com other", True, "www.web.com"), + ("scheme full structure: http://www.web.com", True, "http://www.web.com"), + ("scheme no top-level: http://www.web", True, "http://www.web"), + ("no scheme, no top-level: www.web", False, "www.web"), + ("https scheme: https://www.web.com", True, "https://www.web.com"), + ("ftp scheme: ftp://www.web", True, "ftp://www.web"), + ("ftps scheme: ftps://www.web", True, "ftps://www.web"), + ("subdirectories: www.web.com/directory", True, "www.web.com/directory"), + ("Multiple domains: www.1.2.3.4", True, "www.1.2.3.4"), + ("with port: http://web.com:80", True, "http://web.com:80"), + ( + "full net_loc scheme: http://user:pass@web.com", + True, + "http://user:pass@web.com", + ), + ( + "with valid special chars: http://web.com/,.':;~!@#$*()[]", + True, + "http://web.com/,.':;~!@#$*()[]", + ), + ], +) +def test_rendered_links(type, text, exp, found): + if type == "data": + df = DataFrame([text]) + styler = df.style.format(hyperlinks="html") + else: + df = DataFrame([0], index=[text]) + styler = df.style.format_index(hyperlinks="html") + + rendered = f'{found}' + result = styler.to_html() + assert (rendered in result) is exp + assert (text in result) is not exp # test conversion done when expected and not + + +def test_multiple_rendered_links(): + links = ("www.a.b", "http://a.c", "https://a.d", "ftp://a.e") + # pylint: disable-next=consider-using-f-string + df = DataFrame(["text {} {} text {} {}".format(*links)]) + result = df.style.format(hyperlinks="html").to_html() + href = '{0}' + for link in links: + assert href.format(link) in result + assert href.format("text") not in result + + +def test_concat(styler): + other = styler.data.agg(["mean"]).style + styler.concat(other).set_uuid("X") + result = styler.to_html() + fp = "foot0_" + expected = dedent( + f"""\ + + b + 2.690000 + + + mean + 2.650000 + + + + """ + ) + assert expected in result + + +def test_concat_recursion(styler): + df = styler.data + styler1 = styler + styler2 = Styler(df.agg(["mean"]), precision=3) + styler3 = Styler(df.agg(["mean"]), precision=4) + styler1.concat(styler2.concat(styler3)).set_uuid("X") + result = styler.to_html() + # notice that the second concat (last of the output html), + # there are two `foot_` in the id and class + fp1 = "foot0_" + fp2 = "foot0_foot0_" + expected = dedent( + f"""\ + + b + 2.690000 + + + mean + 2.650 + + + mean + 2.6500 + + + + """ + ) + assert expected in result + + +def test_concat_chain(styler): + df = styler.data + styler1 = styler + styler2 = Styler(df.agg(["mean"]), precision=3) + styler3 = Styler(df.agg(["mean"]), precision=4) + styler1.concat(styler2).concat(styler3).set_uuid("X") + result = styler.to_html() + fp1 = "foot0_" + fp2 = "foot1_" + expected = dedent( + f"""\ + + b + 2.690000 + + + mean + 2.650 + + + mean + 2.6500 + + + + """ + ) + assert expected in result + + +def test_concat_combined(): + def html_lines(foot_prefix: str): + assert foot_prefix.endswith("_") or foot_prefix == "" + fp = foot_prefix + return indent( + dedent( + f"""\ + + a + 2.610000 + + + b + 2.690000 + + """ + ), + prefix=" " * 4, + ) + + df = DataFrame([[2.61], [2.69]], index=["a", "b"], columns=["A"]) + s1 = df.style.highlight_max(color="red") + s2 = df.style.highlight_max(color="green") + s3 = df.style.highlight_max(color="blue") + s4 = df.style.highlight_max(color="yellow") + + result = s1.concat(s2).concat(s3.concat(s4)).set_uuid("X").to_html() + expected_css = dedent( + """\ + + """ + ) + expected_table = ( + dedent( + """\ + + + + + + + + + """ + ) + + html_lines("") + + html_lines("foot0_") + + html_lines("foot1_") + + html_lines("foot1_foot0_") + + dedent( + """\ + +
 A
+ """ + ) + ) + assert expected_css + expected_table == result + + +def test_to_html_na_rep_non_scalar_data(datapath): + # GH47103 + df = DataFrame([{"a": 1, "b": [1, 2, 3], "c": np.nan}]) + result = df.style.format(na_rep="-").to_html(table_uuid="test") + expected = """\ + + + + + + + + + + + + + + + + + + +
 abc
01[1, 2, 3]-
+""" + assert result == expected diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_matplotlib.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_matplotlib.py new file mode 100644 index 0000000000000000000000000000000000000000..fb7a77f1ddb27db66a847fc1a1d87d14d95822aa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_matplotlib.py @@ -0,0 +1,335 @@ +import gc + +import numpy as np +import pytest + +from pandas import ( + DataFrame, + IndexSlice, + Series, +) + +pytest.importorskip("matplotlib") +pytest.importorskip("jinja2") + +import matplotlib as mpl + +from pandas.io.formats.style import Styler + + +@pytest.fixture(autouse=True) +def mpl_cleanup(): + # matplotlib/testing/decorators.py#L24 + # 1) Resets units registry + # 2) Resets rc_context + # 3) Closes all figures + mpl = pytest.importorskip("matplotlib") + mpl_units = pytest.importorskip("matplotlib.units") + plt = pytest.importorskip("matplotlib.pyplot") + orig_units_registry = mpl_units.registry.copy() + with mpl.rc_context(): + mpl.use("template") + yield + mpl_units.registry.clear() + mpl_units.registry.update(orig_units_registry) + plt.close("all") + # https://matplotlib.org/stable/users/prev_whats_new/whats_new_3.6.0.html#garbage-collection-is-no-longer-run-on-figure-close # noqa: E501 + gc.collect(1) + + +@pytest.fixture +def df(): + return DataFrame([[1, 2], [2, 4]], columns=["A", "B"]) + + +@pytest.fixture +def styler(df): + return Styler(df, uuid_len=0) + + +@pytest.fixture +def df_blank(): + return DataFrame([[0, 0], [0, 0]], columns=["A", "B"], index=["X", "Y"]) + + +@pytest.fixture +def styler_blank(df_blank): + return Styler(df_blank, uuid_len=0) + + +@pytest.mark.parametrize("f", ["background_gradient", "text_gradient"]) +def test_function_gradient(styler, f): + for c_map in [None, "YlOrRd"]: + result = getattr(styler, f)(cmap=c_map)._compute().ctx + assert all("#" in x[0][1] for x in result.values()) + assert result[(0, 0)] == result[(0, 1)] + assert result[(1, 0)] == result[(1, 1)] + + +@pytest.mark.parametrize("f", ["background_gradient", "text_gradient"]) +def test_background_gradient_color(styler, f): + result = getattr(styler, f)(subset=IndexSlice[1, "A"])._compute().ctx + if f == "background_gradient": + assert result[(1, 0)] == [("background-color", "#fff7fb"), ("color", "#000000")] + elif f == "text_gradient": + assert result[(1, 0)] == [("color", "#fff7fb")] + + +@pytest.mark.parametrize( + "axis, expected", + [ + (0, ["low", "low", "high", "high"]), + (1, ["low", "high", "low", "high"]), + (None, ["low", "mid", "mid", "high"]), + ], +) +@pytest.mark.parametrize("f", ["background_gradient", "text_gradient"]) +def test_background_gradient_axis(styler, axis, expected, f): + if f == "background_gradient": + colors = { + "low": [("background-color", "#f7fbff"), ("color", "#000000")], + "mid": [("background-color", "#abd0e6"), ("color", "#000000")], + "high": [("background-color", "#08306b"), ("color", "#f1f1f1")], + } + elif f == "text_gradient": + colors = { + "low": [("color", "#f7fbff")], + "mid": [("color", "#abd0e6")], + "high": [("color", "#08306b")], + } + result = getattr(styler, f)(cmap="Blues", axis=axis)._compute().ctx + for i, cell in enumerate([(0, 0), (0, 1), (1, 0), (1, 1)]): + assert result[cell] == colors[expected[i]] + + +@pytest.mark.parametrize( + "cmap, expected", + [ + ( + "PuBu", + { + (4, 5): [("background-color", "#86b0d3"), ("color", "#000000")], + (4, 6): [("background-color", "#83afd3"), ("color", "#f1f1f1")], + }, + ), + ( + "YlOrRd", + { + (4, 8): [("background-color", "#fd913e"), ("color", "#000000")], + (4, 9): [("background-color", "#fd8f3d"), ("color", "#f1f1f1")], + }, + ), + ( + None, + { + (7, 0): [("background-color", "#48c16e"), ("color", "#f1f1f1")], + (7, 1): [("background-color", "#4cc26c"), ("color", "#000000")], + }, + ), + ], +) +def test_text_color_threshold(cmap, expected): + # GH 39888 + df = DataFrame(np.arange(100).reshape(10, 10)) + result = df.style.background_gradient(cmap=cmap, axis=None)._compute().ctx + for k in expected.keys(): + assert result[k] == expected[k] + + +def test_background_gradient_vmin_vmax(): + # GH 12145 + df = DataFrame(range(5)) + ctx = df.style.background_gradient(vmin=1, vmax=3)._compute().ctx + assert ctx[(0, 0)] == ctx[(1, 0)] + assert ctx[(4, 0)] == ctx[(3, 0)] + + +def test_background_gradient_int64(): + # GH 28869 + df1 = Series(range(3)).to_frame() + df2 = Series(range(3), dtype="Int64").to_frame() + ctx1 = df1.style.background_gradient()._compute().ctx + ctx2 = df2.style.background_gradient()._compute().ctx + assert ctx2[(0, 0)] == ctx1[(0, 0)] + assert ctx2[(1, 0)] == ctx1[(1, 0)] + assert ctx2[(2, 0)] == ctx1[(2, 0)] + + +@pytest.mark.parametrize( + "axis, gmap, expected", + [ + ( + 0, + [1, 2], + { + (0, 0): [("background-color", "#fff7fb"), ("color", "#000000")], + (1, 0): [("background-color", "#023858"), ("color", "#f1f1f1")], + (0, 1): [("background-color", "#fff7fb"), ("color", "#000000")], + (1, 1): [("background-color", "#023858"), ("color", "#f1f1f1")], + }, + ), + ( + 1, + [1, 2], + { + (0, 0): [("background-color", "#fff7fb"), ("color", "#000000")], + (1, 0): [("background-color", "#fff7fb"), ("color", "#000000")], + (0, 1): [("background-color", "#023858"), ("color", "#f1f1f1")], + (1, 1): [("background-color", "#023858"), ("color", "#f1f1f1")], + }, + ), + ( + None, + np.array([[2, 1], [1, 2]]), + { + (0, 0): [("background-color", "#023858"), ("color", "#f1f1f1")], + (1, 0): [("background-color", "#fff7fb"), ("color", "#000000")], + (0, 1): [("background-color", "#fff7fb"), ("color", "#000000")], + (1, 1): [("background-color", "#023858"), ("color", "#f1f1f1")], + }, + ), + ], +) +def test_background_gradient_gmap_array(styler_blank, axis, gmap, expected): + # tests when gmap is given as a sequence and converted to ndarray + result = styler_blank.background_gradient(axis=axis, gmap=gmap)._compute().ctx + assert result == expected + + +@pytest.mark.parametrize( + "gmap, axis", [([1, 2, 3], 0), ([1, 2], 1), (np.array([[1, 2], [1, 2]]), None)] +) +def test_background_gradient_gmap_array_raises(gmap, axis): + # test when gmap as converted ndarray is bad shape + df = DataFrame([[0, 0, 0], [0, 0, 0]]) + msg = "supplied 'gmap' is not correct shape" + with pytest.raises(ValueError, match=msg): + df.style.background_gradient(gmap=gmap, axis=axis)._compute() + + +@pytest.mark.parametrize( + "gmap", + [ + DataFrame( # reverse the columns + [[2, 1], [1, 2]], columns=["B", "A"], index=["X", "Y"] + ), + DataFrame( # reverse the index + [[2, 1], [1, 2]], columns=["A", "B"], index=["Y", "X"] + ), + DataFrame( # reverse the index and columns + [[1, 2], [2, 1]], columns=["B", "A"], index=["Y", "X"] + ), + DataFrame( # add unnecessary columns + [[1, 2, 3], [2, 1, 3]], columns=["A", "B", "C"], index=["X", "Y"] + ), + DataFrame( # add unnecessary index + [[1, 2], [2, 1], [3, 3]], columns=["A", "B"], index=["X", "Y", "Z"] + ), + ], +) +@pytest.mark.parametrize( + "subset, exp_gmap", # exp_gmap is underlying map DataFrame should conform to + [ + (None, [[1, 2], [2, 1]]), + (["A"], [[1], [2]]), # slice only column "A" in data and gmap + (["B", "A"], [[2, 1], [1, 2]]), # reverse the columns in data + (IndexSlice["X", :], [[1, 2]]), # slice only index "X" in data and gmap + (IndexSlice[["Y", "X"], :], [[2, 1], [1, 2]]), # reverse the index in data + ], +) +def test_background_gradient_gmap_dataframe_align(styler_blank, gmap, subset, exp_gmap): + # test gmap given as DataFrame that it aligns to the data including subset + expected = styler_blank.background_gradient(axis=None, gmap=exp_gmap, subset=subset) + result = styler_blank.background_gradient(axis=None, gmap=gmap, subset=subset) + assert expected._compute().ctx == result._compute().ctx + + +@pytest.mark.parametrize( + "gmap, axis, exp_gmap", + [ + (Series([2, 1], index=["Y", "X"]), 0, [[1, 1], [2, 2]]), # revrse the index + (Series([2, 1], index=["B", "A"]), 1, [[1, 2], [1, 2]]), # revrse the cols + (Series([1, 2, 3], index=["X", "Y", "Z"]), 0, [[1, 1], [2, 2]]), # add idx + (Series([1, 2, 3], index=["A", "B", "C"]), 1, [[1, 2], [1, 2]]), # add col + ], +) +def test_background_gradient_gmap_series_align(styler_blank, gmap, axis, exp_gmap): + # test gmap given as Series that it aligns to the data including subset + expected = styler_blank.background_gradient(axis=None, gmap=exp_gmap)._compute() + result = styler_blank.background_gradient(axis=axis, gmap=gmap)._compute() + assert expected.ctx == result.ctx + + +@pytest.mark.parametrize( + "gmap, axis", + [ + (DataFrame([[1, 2], [2, 1]], columns=["A", "B"], index=["X", "Y"]), 1), + (DataFrame([[1, 2], [2, 1]], columns=["A", "B"], index=["X", "Y"]), 0), + ], +) +def test_background_gradient_gmap_wrong_dataframe(styler_blank, gmap, axis): + # test giving a gmap in DataFrame but with wrong axis + msg = "'gmap' is a DataFrame but underlying data for operations is a Series" + with pytest.raises(ValueError, match=msg): + styler_blank.background_gradient(gmap=gmap, axis=axis)._compute() + + +def test_background_gradient_gmap_wrong_series(styler_blank): + # test giving a gmap in Series form but with wrong axis + msg = "'gmap' is a Series but underlying data for operations is a DataFrame" + gmap = Series([1, 2], index=["X", "Y"]) + with pytest.raises(ValueError, match=msg): + styler_blank.background_gradient(gmap=gmap, axis=None)._compute() + + +def test_background_gradient_nullable_dtypes(): + # GH 50712 + df1 = DataFrame([[1], [0], [np.nan]], dtype=float) + df2 = DataFrame([[1], [0], [None]], dtype="Int64") + + ctx1 = df1.style.background_gradient()._compute().ctx + ctx2 = df2.style.background_gradient()._compute().ctx + assert ctx1 == ctx2 + + +@pytest.mark.parametrize( + "cmap", + ["PuBu", mpl.colormaps["PuBu"]], +) +def test_bar_colormap(cmap): + data = DataFrame([[1, 2], [3, 4]]) + ctx = data.style.bar(cmap=cmap, axis=None)._compute().ctx + pubu_colors = { + (0, 0): "#d0d1e6", + (1, 0): "#056faf", + (0, 1): "#73a9cf", + (1, 1): "#023858", + } + for k, v in pubu_colors.items(): + assert v in ctx[k][1][1] + + +def test_bar_color_raises(df): + msg = "`color` must be string or list or tuple of 2 strings" + with pytest.raises(ValueError, match=msg): + df.style.bar(color={"a", "b"}).to_html() + with pytest.raises(ValueError, match=msg): + df.style.bar(color=["a", "b", "c"]).to_html() + + msg = "`color` and `cmap` cannot both be given" + with pytest.raises(ValueError, match=msg): + df.style.bar(color="something", cmap="something else").to_html() + + +@pytest.mark.parametrize( + "plot_method", + ["scatter", "hexbin"], +) +def test_pass_colormap_instance(df, plot_method): + # https://github.com/pandas-dev/pandas/issues/49374 + cmap = mpl.colors.ListedColormap([[1, 1, 1], [0, 0, 0]]) + df["c"] = df.A + df.B + kwargs = {"x": "A", "y": "B", "c": "c", "colormap": cmap} + if plot_method == "hexbin": + kwargs["C"] = kwargs.pop("c") + getattr(df.plot, plot_method)(**kwargs) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_non_unique.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_non_unique.py new file mode 100644 index 0000000000000000000000000000000000000000..e4d31fe21f2c9cf3454a67f8c7443382f7f1c0ef --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_non_unique.py @@ -0,0 +1,140 @@ +from textwrap import dedent + +import pytest + +from pandas import ( + DataFrame, + IndexSlice, +) + +pytest.importorskip("jinja2") + +from pandas.io.formats.style import Styler + + +@pytest.fixture +def df(): + return DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["i", "j", "j"], + columns=["c", "d", "d"], + dtype=float, + ) + + +@pytest.fixture +def styler(df): + return Styler(df, uuid_len=0) + + +def test_format_non_unique(df): + # GH 41269 + + # test dict + html = df.style.format({"d": "{:.1f}"}).to_html() + for val in ["1.000000<", "4.000000<", "7.000000<"]: + assert val in html + for val in ["2.0<", "3.0<", "5.0<", "6.0<", "8.0<", "9.0<"]: + assert val in html + + # test subset + html = df.style.format(precision=1, subset=IndexSlice["j", "d"]).to_html() + for val in ["1.000000<", "4.000000<", "7.000000<", "2.000000<", "3.000000<"]: + assert val in html + for val in ["5.0<", "6.0<", "8.0<", "9.0<"]: + assert val in html + + +@pytest.mark.parametrize("func", ["apply", "map"]) +def test_apply_map_non_unique_raises(df, func): + # GH 41269 + if func == "apply": + op = lambda s: ["color: red;"] * len(s) + else: + op = lambda v: "color: red;" + + with pytest.raises(KeyError, match="`Styler.apply` and `.map` are not"): + getattr(df.style, func)(op)._compute() + + +def test_table_styles_dict_non_unique_index(styler): + styles = styler.set_table_styles( + {"j": [{"selector": "td", "props": "a: v;"}]}, axis=1 + ).table_styles + assert styles == [ + {"selector": "td.row1", "props": [("a", "v")]}, + {"selector": "td.row2", "props": [("a", "v")]}, + ] + + +def test_table_styles_dict_non_unique_columns(styler): + styles = styler.set_table_styles( + {"d": [{"selector": "td", "props": "a: v;"}]}, axis=0 + ).table_styles + assert styles == [ + {"selector": "td.col1", "props": [("a", "v")]}, + {"selector": "td.col2", "props": [("a", "v")]}, + ] + + +def test_tooltips_non_unique_raises(styler): + # ttips has unique keys + ttips = DataFrame([["1", "2"], ["3", "4"]], columns=["c", "d"], index=["a", "b"]) + styler.set_tooltips(ttips=ttips) # OK + + # ttips has non-unique columns + ttips = DataFrame([["1", "2"], ["3", "4"]], columns=["c", "c"], index=["a", "b"]) + with pytest.raises(KeyError, match="Tooltips render only if `ttips` has unique"): + styler.set_tooltips(ttips=ttips) + + # ttips has non-unique index + ttips = DataFrame([["1", "2"], ["3", "4"]], columns=["c", "d"], index=["a", "a"]) + with pytest.raises(KeyError, match="Tooltips render only if `ttips` has unique"): + styler.set_tooltips(ttips=ttips) + + +def test_set_td_classes_non_unique_raises(styler): + # classes has unique keys + classes = DataFrame([["1", "2"], ["3", "4"]], columns=["c", "d"], index=["a", "b"]) + styler.set_td_classes(classes=classes) # OK + + # classes has non-unique columns + classes = DataFrame([["1", "2"], ["3", "4"]], columns=["c", "c"], index=["a", "b"]) + with pytest.raises(KeyError, match="Classes render only if `classes` has unique"): + styler.set_td_classes(classes=classes) + + # classes has non-unique index + classes = DataFrame([["1", "2"], ["3", "4"]], columns=["c", "d"], index=["a", "a"]) + with pytest.raises(KeyError, match="Classes render only if `classes` has unique"): + styler.set_td_classes(classes=classes) + + +def test_hide_columns_non_unique(styler): + ctx = styler.hide(["d"], axis="columns")._translate(True, True) + + assert ctx["head"][0][1]["display_value"] == "c" + assert ctx["head"][0][1]["is_visible"] is True + + assert ctx["head"][0][2]["display_value"] == "d" + assert ctx["head"][0][2]["is_visible"] is False + + assert ctx["head"][0][3]["display_value"] == "d" + assert ctx["head"][0][3]["is_visible"] is False + + assert ctx["body"][0][1]["is_visible"] is True + assert ctx["body"][0][2]["is_visible"] is False + assert ctx["body"][0][3]["is_visible"] is False + + +def test_latex_non_unique(styler): + result = styler.to_latex() + assert result == dedent( + """\ + \\begin{tabular}{lrrr} + & c & d & d \\\\ + i & 1.000000 & 2.000000 & 3.000000 \\\\ + j & 4.000000 & 5.000000 & 6.000000 \\\\ + j & 7.000000 & 8.000000 & 9.000000 \\\\ + \\end{tabular} + """ + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_style.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_style.py new file mode 100644 index 0000000000000000000000000000000000000000..6fa72bd48031cca999b81cccfcedafcd3abcd924 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_style.py @@ -0,0 +1,1588 @@ +import contextlib +import copy +import re +from textwrap import dedent + +import numpy as np +import pytest + +from pandas import ( + DataFrame, + IndexSlice, + MultiIndex, + Series, + option_context, +) +import pandas._testing as tm + +jinja2 = pytest.importorskip("jinja2") +from pandas.io.formats.style import ( # isort:skip + Styler, +) +from pandas.io.formats.style_render import ( + _get_level_lengths, + _get_trimming_maximums, + maybe_convert_css_to_tuples, + non_reducing_slice, +) + + +@pytest.fixture +def mi_df(): + return DataFrame( + [[1, 2], [3, 4]], + index=MultiIndex.from_product([["i0"], ["i1_a", "i1_b"]]), + columns=MultiIndex.from_product([["c0"], ["c1_a", "c1_b"]]), + dtype=int, + ) + + +@pytest.fixture +def mi_styler(mi_df): + return Styler(mi_df, uuid_len=0) + + +@pytest.fixture +def mi_styler_comp(mi_styler): + # comprehensively add features to mi_styler + mi_styler = mi_styler._copy(deepcopy=True) + mi_styler.css = {**mi_styler.css, "row": "ROW", "col": "COL"} + mi_styler.uuid_len = 5 + mi_styler.uuid = "abcde" + mi_styler.set_caption("capt") + mi_styler.set_table_styles([{"selector": "a", "props": "a:v;"}]) + mi_styler.hide(axis="columns") + mi_styler.hide([("c0", "c1_a")], axis="columns", names=True) + mi_styler.hide(axis="index") + mi_styler.hide([("i0", "i1_a")], axis="index", names=True) + mi_styler.set_table_attributes('class="box"') + other = mi_styler.data.agg(["mean"]) + other.index = MultiIndex.from_product([[""], other.index]) + mi_styler.concat(other.style) + mi_styler.format(na_rep="MISSING", precision=3) + mi_styler.format_index(precision=2, axis=0) + mi_styler.format_index(precision=4, axis=1) + mi_styler.highlight_max(axis=None) + mi_styler.map_index(lambda x: "color: white;", axis=0) + mi_styler.map_index(lambda x: "color: black;", axis=1) + mi_styler.set_td_classes( + DataFrame( + [["a", "b"], ["a", "c"]], index=mi_styler.index, columns=mi_styler.columns + ) + ) + mi_styler.set_tooltips( + DataFrame( + [["a2", "b2"], ["a2", "c2"]], + index=mi_styler.index, + columns=mi_styler.columns, + ) + ) + return mi_styler + + +@pytest.fixture +def blank_value(): + return " " + + +@pytest.fixture +def df(): + df = DataFrame({"A": [0, 1], "B": np.random.default_rng(2).standard_normal(2)}) + return df + + +@pytest.fixture +def styler(df): + df = DataFrame({"A": [0, 1], "B": np.random.default_rng(2).standard_normal(2)}) + return Styler(df) + + +@pytest.mark.parametrize( + "sparse_columns, exp_cols", + [ + ( + True, + [ + {"is_visible": True, "attributes": 'colspan="2"', "value": "c0"}, + {"is_visible": False, "attributes": "", "value": "c0"}, + ], + ), + ( + False, + [ + {"is_visible": True, "attributes": "", "value": "c0"}, + {"is_visible": True, "attributes": "", "value": "c0"}, + ], + ), + ], +) +def test_mi_styler_sparsify_columns(mi_styler, sparse_columns, exp_cols): + exp_l1_c0 = {"is_visible": True, "attributes": "", "display_value": "c1_a"} + exp_l1_c1 = {"is_visible": True, "attributes": "", "display_value": "c1_b"} + + ctx = mi_styler._translate(True, sparse_columns) + + assert exp_cols[0].items() <= ctx["head"][0][2].items() + assert exp_cols[1].items() <= ctx["head"][0][3].items() + assert exp_l1_c0.items() <= ctx["head"][1][2].items() + assert exp_l1_c1.items() <= ctx["head"][1][3].items() + + +@pytest.mark.parametrize( + "sparse_index, exp_rows", + [ + ( + True, + [ + {"is_visible": True, "attributes": 'rowspan="2"', "value": "i0"}, + {"is_visible": False, "attributes": "", "value": "i0"}, + ], + ), + ( + False, + [ + {"is_visible": True, "attributes": "", "value": "i0"}, + {"is_visible": True, "attributes": "", "value": "i0"}, + ], + ), + ], +) +def test_mi_styler_sparsify_index(mi_styler, sparse_index, exp_rows): + exp_l1_r0 = {"is_visible": True, "attributes": "", "display_value": "i1_a"} + exp_l1_r1 = {"is_visible": True, "attributes": "", "display_value": "i1_b"} + + ctx = mi_styler._translate(sparse_index, True) + + assert exp_rows[0].items() <= ctx["body"][0][0].items() + assert exp_rows[1].items() <= ctx["body"][1][0].items() + assert exp_l1_r0.items() <= ctx["body"][0][1].items() + assert exp_l1_r1.items() <= ctx["body"][1][1].items() + + +def test_mi_styler_sparsify_options(mi_styler): + with option_context("styler.sparse.index", False): + html1 = mi_styler.to_html() + with option_context("styler.sparse.index", True): + html2 = mi_styler.to_html() + + assert html1 != html2 + + with option_context("styler.sparse.columns", False): + html1 = mi_styler.to_html() + with option_context("styler.sparse.columns", True): + html2 = mi_styler.to_html() + + assert html1 != html2 + + +@pytest.mark.parametrize( + "rn, cn, max_els, max_rows, max_cols, exp_rn, exp_cn", + [ + (100, 100, 100, None, None, 12, 6), # reduce to (12, 6) < 100 elements + (1000, 3, 750, None, None, 250, 3), # dynamically reduce rows to 250, keep cols + (4, 1000, 500, None, None, 4, 125), # dynamically reduce cols to 125, keep rows + (1000, 3, 750, 10, None, 10, 3), # overwrite above dynamics with max_row + (4, 1000, 500, None, 5, 4, 5), # overwrite above dynamics with max_col + (100, 100, 700, 50, 50, 25, 25), # rows cols below given maxes so < 700 elmts + ], +) +def test_trimming_maximum(rn, cn, max_els, max_rows, max_cols, exp_rn, exp_cn): + rn, cn = _get_trimming_maximums( + rn, cn, max_els, max_rows, max_cols, scaling_factor=0.5 + ) + assert (rn, cn) == (exp_rn, exp_cn) + + +@pytest.mark.parametrize( + "option, val", + [ + ("styler.render.max_elements", 6), + ("styler.render.max_rows", 3), + ], +) +def test_render_trimming_rows(option, val): + # test auto and specific trimming of rows + df = DataFrame(np.arange(120).reshape(60, 2)) + with option_context(option, val): + ctx = df.style._translate(True, True) + assert len(ctx["head"][0]) == 3 # index + 2 data cols + assert len(ctx["body"]) == 4 # 3 data rows + trimming row + assert len(ctx["body"][0]) == 3 # index + 2 data cols + + +@pytest.mark.parametrize( + "option, val", + [ + ("styler.render.max_elements", 6), + ("styler.render.max_columns", 2), + ], +) +def test_render_trimming_cols(option, val): + # test auto and specific trimming of cols + df = DataFrame(np.arange(30).reshape(3, 10)) + with option_context(option, val): + ctx = df.style._translate(True, True) + assert len(ctx["head"][0]) == 4 # index + 2 data cols + trimming col + assert len(ctx["body"]) == 3 # 3 data rows + assert len(ctx["body"][0]) == 4 # index + 2 data cols + trimming col + + +def test_render_trimming_mi(): + midx = MultiIndex.from_product([[1, 2], [1, 2, 3]]) + df = DataFrame(np.arange(36).reshape(6, 6), columns=midx, index=midx) + with option_context("styler.render.max_elements", 4): + ctx = df.style._translate(True, True) + + assert len(ctx["body"][0]) == 5 # 2 indexes + 2 data cols + trimming row + assert {"attributes": 'rowspan="2"'}.items() <= ctx["body"][0][0].items() + assert {"class": "data row0 col_trim"}.items() <= ctx["body"][0][4].items() + assert {"class": "data row_trim col_trim"}.items() <= ctx["body"][2][4].items() + assert len(ctx["body"]) == 3 # 2 data rows + trimming row + + +def test_render_empty_mi(): + # GH 43305 + df = DataFrame(index=MultiIndex.from_product([["A"], [0, 1]], names=[None, "one"])) + expected = dedent( + """\ + > + + +   + one + + + """ + ) + assert expected in df.style.to_html() + + +@pytest.mark.parametrize("comprehensive", [True, False]) +@pytest.mark.parametrize("render", [True, False]) +@pytest.mark.parametrize("deepcopy", [True, False]) +def test_copy(comprehensive, render, deepcopy, mi_styler, mi_styler_comp): + styler = mi_styler_comp if comprehensive else mi_styler + styler.uuid_len = 5 + + s2 = copy.deepcopy(styler) if deepcopy else copy.copy(styler) # make copy and check + assert s2 is not styler + + if render: + styler.to_html() + + excl = [ + "cellstyle_map", # render time vars.. + "cellstyle_map_columns", + "cellstyle_map_index", + "template_latex", # render templates are class level + "template_html", + "template_html_style", + "template_html_table", + ] + if not deepcopy: # check memory locations are equal for all included attributes + for attr in [a for a in styler.__dict__ if (not callable(a) and a not in excl)]: + assert id(getattr(s2, attr)) == id(getattr(styler, attr)) + else: # check memory locations are different for nested or mutable vars + shallow = [ + "data", + "columns", + "index", + "uuid_len", + "uuid", + "caption", + "cell_ids", + "hide_index_", + "hide_columns_", + "hide_index_names", + "hide_column_names", + "table_attributes", + ] + for attr in shallow: + assert id(getattr(s2, attr)) == id(getattr(styler, attr)) + + for attr in [ + a + for a in styler.__dict__ + if (not callable(a) and a not in excl and a not in shallow) + ]: + if getattr(s2, attr) is None: + assert id(getattr(s2, attr)) == id(getattr(styler, attr)) + else: + assert id(getattr(s2, attr)) != id(getattr(styler, attr)) + + +@pytest.mark.parametrize("deepcopy", [True, False]) +def test_inherited_copy(mi_styler, deepcopy): + # Ensure that the inherited class is preserved when a Styler object is copied. + # GH 52728 + class CustomStyler(Styler): + pass + + custom_styler = CustomStyler(mi_styler.data) + custom_styler_copy = ( + copy.deepcopy(custom_styler) if deepcopy else copy.copy(custom_styler) + ) + assert isinstance(custom_styler_copy, CustomStyler) + + +def test_clear(mi_styler_comp): + # NOTE: if this test fails for new features then 'mi_styler_comp' should be updated + # to ensure proper testing of the 'copy', 'clear', 'export' methods with new feature + # GH 40675 + styler = mi_styler_comp + styler._compute() # execute applied methods + + clean_copy = Styler(styler.data, uuid=styler.uuid) + + excl = [ + "data", + "index", + "columns", + "uuid", + "uuid_len", # uuid is set to be the same on styler and clean_copy + "cell_ids", + "cellstyle_map", # execution time only + "cellstyle_map_columns", # execution time only + "cellstyle_map_index", # execution time only + "template_latex", # render templates are class level + "template_html", + "template_html_style", + "template_html_table", + ] + # tests vars are not same vals on obj and clean copy before clear (except for excl) + for attr in [a for a in styler.__dict__ if not (callable(a) or a in excl)]: + res = getattr(styler, attr) == getattr(clean_copy, attr) + if hasattr(res, "__iter__") and len(res) > 0: + assert not all(res) # some element in iterable differs + elif hasattr(res, "__iter__") and len(res) == 0: + pass # empty array + else: + assert not res # explicit var differs + + # test vars have same vales on obj and clean copy after clearing + styler.clear() + for attr in [a for a in styler.__dict__ if not callable(a)]: + res = getattr(styler, attr) == getattr(clean_copy, attr) + assert all(res) if hasattr(res, "__iter__") else res + + +def test_export(mi_styler_comp, mi_styler): + exp_attrs = [ + "_todo", + "hide_index_", + "hide_index_names", + "hide_columns_", + "hide_column_names", + "table_attributes", + "table_styles", + "css", + ] + for attr in exp_attrs: + check = getattr(mi_styler, attr) == getattr(mi_styler_comp, attr) + assert not ( + all(check) if (hasattr(check, "__iter__") and len(check) > 0) else check + ) + + export = mi_styler_comp.export() + used = mi_styler.use(export) + for attr in exp_attrs: + check = getattr(used, attr) == getattr(mi_styler_comp, attr) + assert all(check) if (hasattr(check, "__iter__") and len(check) > 0) else check + + used.to_html() + + +def test_hide_raises(mi_styler): + msg = "`subset` and `level` cannot be passed simultaneously" + with pytest.raises(ValueError, match=msg): + mi_styler.hide(axis="index", subset="something", level="something else") + + msg = "`level` must be of type `int`, `str` or list of such" + with pytest.raises(ValueError, match=msg): + mi_styler.hide(axis="index", level={"bad": 1, "type": 2}) + + +@pytest.mark.parametrize("level", [1, "one", [1], ["one"]]) +def test_hide_index_level(mi_styler, level): + mi_styler.index.names, mi_styler.columns.names = ["zero", "one"], ["zero", "one"] + ctx = mi_styler.hide(axis="index", level=level)._translate(False, True) + assert len(ctx["head"][0]) == 3 + assert len(ctx["head"][1]) == 3 + assert len(ctx["head"][2]) == 4 + assert ctx["head"][2][0]["is_visible"] + assert not ctx["head"][2][1]["is_visible"] + + assert ctx["body"][0][0]["is_visible"] + assert not ctx["body"][0][1]["is_visible"] + assert ctx["body"][1][0]["is_visible"] + assert not ctx["body"][1][1]["is_visible"] + + +@pytest.mark.parametrize("level", [1, "one", [1], ["one"]]) +@pytest.mark.parametrize("names", [True, False]) +def test_hide_columns_level(mi_styler, level, names): + mi_styler.columns.names = ["zero", "one"] + if names: + mi_styler.index.names = ["zero", "one"] + ctx = mi_styler.hide(axis="columns", level=level)._translate(True, False) + assert len(ctx["head"]) == (2 if names else 1) + + +@pytest.mark.parametrize("method", ["map", "apply"]) +@pytest.mark.parametrize("axis", ["index", "columns"]) +def test_apply_map_header(method, axis): + # GH 41893 + df = DataFrame({"A": [0, 0], "B": [1, 1]}, index=["C", "D"]) + func = { + "apply": lambda s: ["attr: val" if ("A" in v or "C" in v) else "" for v in s], + "map": lambda v: "attr: val" if ("A" in v or "C" in v) else "", + } + + # test execution added to todo + result = getattr(df.style, f"{method}_index")(func[method], axis=axis) + assert len(result._todo) == 1 + assert len(getattr(result, f"ctx_{axis}")) == 0 + + # test ctx object on compute + result._compute() + expected = { + (0, 0): [("attr", "val")], + } + assert getattr(result, f"ctx_{axis}") == expected + + +@pytest.mark.parametrize("method", ["apply", "map"]) +@pytest.mark.parametrize("axis", ["index", "columns"]) +def test_apply_map_header_mi(mi_styler, method, axis): + # GH 41893 + func = { + "apply": lambda s: ["attr: val;" if "b" in v else "" for v in s], + "map": lambda v: "attr: val" if "b" in v else "", + } + result = getattr(mi_styler, f"{method}_index")(func[method], axis=axis)._compute() + expected = {(1, 1): [("attr", "val")]} + assert getattr(result, f"ctx_{axis}") == expected + + +def test_apply_map_header_raises(mi_styler): + # GH 41893 + with pytest.raises(ValueError, match="No axis named bad for object type DataFrame"): + mi_styler.map_index(lambda v: "attr: val;", axis="bad")._compute() + + +class TestStyler: + def test_init_non_pandas(self): + msg = "``data`` must be a Series or DataFrame" + with pytest.raises(TypeError, match=msg): + Styler([1, 2, 3]) + + def test_init_series(self): + result = Styler(Series([1, 2])) + assert result.data.ndim == 2 + + def test_repr_html_ok(self, styler): + styler._repr_html_() + + def test_repr_html_mathjax(self, styler): + # gh-19824 / 41395 + assert "tex2jax_ignore" not in styler._repr_html_() + + with option_context("styler.html.mathjax", False): + assert "tex2jax_ignore" in styler._repr_html_() + + def test_update_ctx(self, styler): + styler._update_ctx(DataFrame({"A": ["color: red", "color: blue"]})) + expected = {(0, 0): [("color", "red")], (1, 0): [("color", "blue")]} + assert styler.ctx == expected + + def test_update_ctx_flatten_multi_and_trailing_semi(self, styler): + attrs = DataFrame({"A": ["color: red; foo: bar", "color:blue ; foo: baz;"]}) + styler._update_ctx(attrs) + expected = { + (0, 0): [("color", "red"), ("foo", "bar")], + (1, 0): [("color", "blue"), ("foo", "baz")], + } + assert styler.ctx == expected + + def test_render(self): + df = DataFrame({"A": [0, 1]}) + style = lambda x: Series(["color: red", "color: blue"], name=x.name) + s = Styler(df, uuid="AB").apply(style) + s.to_html() + # it worked? + + def test_multiple_render(self, df): + # GH 39396 + s = Styler(df, uuid_len=0).map(lambda x: "color: red;", subset=["A"]) + s.to_html() # do 2 renders to ensure css styles not duplicated + assert ( + '" in s.to_html() + ) + + def test_render_empty_dfs(self): + empty_df = DataFrame() + es = Styler(empty_df) + es.to_html() + # An index but no columns + DataFrame(columns=["a"]).style.to_html() + # A column but no index + DataFrame(index=["a"]).style.to_html() + # No IndexError raised? + + def test_render_double(self): + df = DataFrame({"A": [0, 1]}) + style = lambda x: Series( + ["color: red; border: 1px", "color: blue; border: 2px"], name=x.name + ) + s = Styler(df, uuid="AB").apply(style) + s.to_html() + # it worked? + + def test_set_properties(self): + df = DataFrame({"A": [0, 1]}) + result = df.style.set_properties(color="white", size="10px")._compute().ctx + # order is deterministic + v = [("color", "white"), ("size", "10px")] + expected = {(0, 0): v, (1, 0): v} + assert result.keys() == expected.keys() + for v1, v2 in zip(result.values(), expected.values()): + assert sorted(v1) == sorted(v2) + + def test_set_properties_subset(self): + df = DataFrame({"A": [0, 1]}) + result = ( + df.style.set_properties(subset=IndexSlice[0, "A"], color="white") + ._compute() + .ctx + ) + expected = {(0, 0): [("color", "white")]} + assert result == expected + + def test_empty_index_name_doesnt_display(self, blank_value): + # https://github.com/pandas-dev/pandas/pull/12090#issuecomment-180695902 + df = DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) + result = df.style._translate(True, True) + assert len(result["head"]) == 1 + expected = { + "class": "blank level0", + "type": "th", + "value": blank_value, + "is_visible": True, + "display_value": blank_value, + } + assert expected.items() <= result["head"][0][0].items() + + def test_index_name(self): + # https://github.com/pandas-dev/pandas/issues/11655 + df = DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) + result = df.set_index("A").style._translate(True, True) + expected = { + "class": "index_name level0", + "type": "th", + "value": "A", + "is_visible": True, + "display_value": "A", + } + assert expected.items() <= result["head"][1][0].items() + + def test_numeric_columns(self): + # https://github.com/pandas-dev/pandas/issues/12125 + # smoke test for _translate + df = DataFrame({0: [1, 2, 3]}) + df.style._translate(True, True) + + def test_apply_axis(self): + df = DataFrame({"A": [0, 0], "B": [1, 1]}) + f = lambda x: [f"val: {x.max()}" for v in x] + result = df.style.apply(f, axis=1) + assert len(result._todo) == 1 + assert len(result.ctx) == 0 + result._compute() + expected = { + (0, 0): [("val", "1")], + (0, 1): [("val", "1")], + (1, 0): [("val", "1")], + (1, 1): [("val", "1")], + } + assert result.ctx == expected + + result = df.style.apply(f, axis=0) + expected = { + (0, 0): [("val", "0")], + (0, 1): [("val", "1")], + (1, 0): [("val", "0")], + (1, 1): [("val", "1")], + } + result._compute() + assert result.ctx == expected + result = df.style.apply(f) # default + result._compute() + assert result.ctx == expected + + @pytest.mark.parametrize("axis", [0, 1]) + def test_apply_series_return(self, axis): + # GH 42014 + df = DataFrame([[1, 2], [3, 4]], index=["X", "Y"], columns=["X", "Y"]) + + # test Series return where len(Series) < df.index or df.columns but labels OK + func = lambda s: Series(["color: red;"], index=["Y"]) + result = df.style.apply(func, axis=axis)._compute().ctx + assert result[(1, 1)] == [("color", "red")] + assert result[(1 - axis, axis)] == [("color", "red")] + + # test Series return where labels align but different order + func = lambda s: Series(["color: red;", "color: blue;"], index=["Y", "X"]) + result = df.style.apply(func, axis=axis)._compute().ctx + assert result[(0, 0)] == [("color", "blue")] + assert result[(1, 1)] == [("color", "red")] + assert result[(1 - axis, axis)] == [("color", "red")] + assert result[(axis, 1 - axis)] == [("color", "blue")] + + @pytest.mark.parametrize("index", [False, True]) + @pytest.mark.parametrize("columns", [False, True]) + def test_apply_dataframe_return(self, index, columns): + # GH 42014 + df = DataFrame([[1, 2], [3, 4]], index=["X", "Y"], columns=["X", "Y"]) + idxs = ["X", "Y"] if index else ["Y"] + cols = ["X", "Y"] if columns else ["Y"] + df_styles = DataFrame("color: red;", index=idxs, columns=cols) + result = df.style.apply(lambda x: df_styles, axis=None)._compute().ctx + + assert result[(1, 1)] == [("color", "red")] # (Y,Y) styles always present + assert (result[(0, 1)] == [("color", "red")]) is index # (X,Y) only if index + assert (result[(1, 0)] == [("color", "red")]) is columns # (Y,X) only if cols + assert (result[(0, 0)] == [("color", "red")]) is (index and columns) # (X,X) + + @pytest.mark.parametrize( + "slice_", + [ + IndexSlice[:], + IndexSlice[:, ["A"]], + IndexSlice[[1], :], + IndexSlice[[1], ["A"]], + IndexSlice[:2, ["A", "B"]], + ], + ) + @pytest.mark.parametrize("axis", [0, 1]) + def test_apply_subset(self, slice_, axis, df): + def h(x, color="bar"): + return Series(f"color: {color}", index=x.index, name=x.name) + + result = df.style.apply(h, axis=axis, subset=slice_, color="baz")._compute().ctx + expected = { + (r, c): [("color", "baz")] + for r, row in enumerate(df.index) + for c, col in enumerate(df.columns) + if row in df.loc[slice_].index and col in df.loc[slice_].columns + } + assert result == expected + + @pytest.mark.parametrize( + "slice_", + [ + IndexSlice[:], + IndexSlice[:, ["A"]], + IndexSlice[[1], :], + IndexSlice[[1], ["A"]], + IndexSlice[:2, ["A", "B"]], + ], + ) + def test_map_subset(self, slice_, df): + result = df.style.map(lambda x: "color:baz;", subset=slice_)._compute().ctx + expected = { + (r, c): [("color", "baz")] + for r, row in enumerate(df.index) + for c, col in enumerate(df.columns) + if row in df.loc[slice_].index and col in df.loc[slice_].columns + } + assert result == expected + + @pytest.mark.parametrize( + "slice_", + [ + IndexSlice[:, IndexSlice["x", "A"]], + IndexSlice[:, IndexSlice[:, "A"]], + IndexSlice[:, IndexSlice[:, ["A", "C"]]], # missing col element + IndexSlice[IndexSlice["a", 1], :], + IndexSlice[IndexSlice[:, 1], :], + IndexSlice[IndexSlice[:, [1, 3]], :], # missing row element + IndexSlice[:, ("x", "A")], + IndexSlice[("a", 1), :], + ], + ) + def test_map_subset_multiindex(self, slice_): + # GH 19861 + # edited for GH 33562 + if ( + isinstance(slice_[-1], tuple) + and isinstance(slice_[-1][-1], list) + and "C" in slice_[-1][-1] + ): + ctx = pytest.raises(KeyError, match="C") + elif ( + isinstance(slice_[0], tuple) + and isinstance(slice_[0][1], list) + and 3 in slice_[0][1] + ): + ctx = pytest.raises(KeyError, match="3") + else: + ctx = contextlib.nullcontext() + + idx = MultiIndex.from_product([["a", "b"], [1, 2]]) + col = MultiIndex.from_product([["x", "y"], ["A", "B"]]) + df = DataFrame(np.random.default_rng(2).random((4, 4)), columns=col, index=idx) + + with ctx: + df.style.map(lambda x: "color: red;", subset=slice_).to_html() + + def test_map_subset_multiindex_code(self): + # https://github.com/pandas-dev/pandas/issues/25858 + # Checks styler.map works with multindex when codes are provided + codes = np.array([[0, 0, 1, 1], [0, 1, 0, 1]]) + columns = MultiIndex( + levels=[["a", "b"], ["%", "#"]], codes=codes, names=["", ""] + ) + df = DataFrame( + [[1, -1, 1, 1], [-1, 1, 1, 1]], index=["hello", "world"], columns=columns + ) + pct_subset = IndexSlice[:, IndexSlice[:, "%":"%"]] + + def color_negative_red(val): + color = "red" if val < 0 else "black" + return f"color: {color}" + + df.loc[pct_subset] + df.style.map(color_negative_red, subset=pct_subset) + + @pytest.mark.parametrize( + "stylefunc", ["background_gradient", "bar", "text_gradient"] + ) + def test_subset_for_boolean_cols(self, stylefunc): + # GH47838 + df = DataFrame( + [ + [1, 2], + [3, 4], + ], + columns=[False, True], + ) + styled = getattr(df.style, stylefunc)() + styled._compute() + assert set(styled.ctx) == {(0, 0), (0, 1), (1, 0), (1, 1)} + + def test_empty(self): + df = DataFrame({"A": [1, 0]}) + s = df.style + s.ctx = {(0, 0): [("color", "red")], (1, 0): [("", "")]} + + result = s._translate(True, True)["cellstyle"] + expected = [ + {"props": [("color", "red")], "selectors": ["row0_col0"]}, + {"props": [("", "")], "selectors": ["row1_col0"]}, + ] + assert result == expected + + def test_duplicate(self): + df = DataFrame({"A": [1, 0]}) + s = df.style + s.ctx = {(0, 0): [("color", "red")], (1, 0): [("color", "red")]} + + result = s._translate(True, True)["cellstyle"] + expected = [ + {"props": [("color", "red")], "selectors": ["row0_col0", "row1_col0"]} + ] + assert result == expected + + def test_init_with_na_rep(self): + # GH 21527 28358 + df = DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + + ctx = Styler(df, na_rep="NA")._translate(True, True) + assert ctx["body"][0][1]["display_value"] == "NA" + assert ctx["body"][0][2]["display_value"] == "NA" + + def test_caption(self, df): + styler = Styler(df, caption="foo") + result = styler.to_html() + assert all(["caption" in result, "foo" in result]) + + styler = df.style + result = styler.set_caption("baz") + assert styler is result + assert styler.caption == "baz" + + def test_uuid(self, df): + styler = Styler(df, uuid="abc123") + result = styler.to_html() + assert "abc123" in result + + styler = df.style + result = styler.set_uuid("aaa") + assert result is styler + assert result.uuid == "aaa" + + def test_unique_id(self): + # See https://github.com/pandas-dev/pandas/issues/16780 + df = DataFrame({"a": [1, 3, 5, 6], "b": [2, 4, 12, 21]}) + result = df.style.to_html(uuid="test") + assert "test" in result + ids = re.findall('id="(.*?)"', result) + assert np.unique(ids).size == len(ids) + + def test_table_styles(self, df): + style = [{"selector": "th", "props": [("foo", "bar")]}] # default format + styler = Styler(df, table_styles=style) + result = " ".join(styler.to_html().split()) + assert "th { foo: bar; }" in result + + styler = df.style + result = styler.set_table_styles(style) + assert styler is result + assert styler.table_styles == style + + # GH 39563 + style = [{"selector": "th", "props": "foo:bar;"}] # css string format + styler = df.style.set_table_styles(style) + result = " ".join(styler.to_html().split()) + assert "th { foo: bar; }" in result + + def test_table_styles_multiple(self, df): + ctx = df.style.set_table_styles( + [ + {"selector": "th,td", "props": "color:red;"}, + {"selector": "tr", "props": "color:green;"}, + ] + )._translate(True, True)["table_styles"] + assert ctx == [ + {"selector": "th", "props": [("color", "red")]}, + {"selector": "td", "props": [("color", "red")]}, + {"selector": "tr", "props": [("color", "green")]}, + ] + + def test_table_styles_dict_multiple_selectors(self, df): + # GH 44011 + result = df.style.set_table_styles( + { + "B": [ + {"selector": "th,td", "props": [("border-left", "2px solid black")]} + ] + } + )._translate(True, True)["table_styles"] + + expected = [ + {"selector": "th.col1", "props": [("border-left", "2px solid black")]}, + {"selector": "td.col1", "props": [("border-left", "2px solid black")]}, + ] + + assert result == expected + + def test_maybe_convert_css_to_tuples(self): + expected = [("a", "b"), ("c", "d e")] + assert maybe_convert_css_to_tuples("a:b;c:d e;") == expected + assert maybe_convert_css_to_tuples("a: b ;c: d e ") == expected + expected = [] + assert maybe_convert_css_to_tuples("") == expected + + def test_maybe_convert_css_to_tuples_err(self): + msg = "Styles supplied as string must follow CSS rule formats" + with pytest.raises(ValueError, match=msg): + maybe_convert_css_to_tuples("err") + + def test_table_attributes(self, df): + attributes = 'class="foo" data-bar' + styler = Styler(df, table_attributes=attributes) + result = styler.to_html() + assert 'class="foo" data-bar' in result + + result = df.style.set_table_attributes(attributes).to_html() + assert 'class="foo" data-bar' in result + + def test_apply_none(self): + def f(x): + return DataFrame( + np.where(x == x.max(), "color: red", ""), + index=x.index, + columns=x.columns, + ) + + result = DataFrame([[1, 2], [3, 4]]).style.apply(f, axis=None)._compute().ctx + assert result[(1, 1)] == [("color", "red")] + + def test_trim(self, df): + result = df.style.to_html() # trim=True + assert result.count("#") == 0 + + result = df.style.highlight_max().to_html() + assert result.count("#") == len(df.columns) + + def test_export(self, df, styler): + f = lambda x: "color: red" if x > 0 else "color: blue" + g = lambda x, z: f"color: {z}" if x > 0 else f"color: {z}" + style1 = styler + style1.map(f).map(g, z="b").highlight_max()._compute() # = render + result = style1.export() + style2 = df.style + style2.use(result) + assert style1._todo == style2._todo + style2.to_html() + + def test_bad_apply_shape(self): + df = DataFrame([[1, 2], [3, 4]], index=["A", "B"], columns=["X", "Y"]) + + msg = "resulted in the apply method collapsing to a Series." + with pytest.raises(ValueError, match=msg): + df.style._apply(lambda x: "x") + + msg = "created invalid {} labels" + with pytest.raises(ValueError, match=msg.format("index")): + df.style._apply(lambda x: [""]) + + with pytest.raises(ValueError, match=msg.format("index")): + df.style._apply(lambda x: ["", "", "", ""]) + + with pytest.raises(ValueError, match=msg.format("index")): + df.style._apply(lambda x: Series(["a:v;", ""], index=["A", "C"]), axis=0) + + with pytest.raises(ValueError, match=msg.format("columns")): + df.style._apply(lambda x: ["", "", ""], axis=1) + + with pytest.raises(ValueError, match=msg.format("columns")): + df.style._apply(lambda x: Series(["a:v;", ""], index=["X", "Z"]), axis=1) + + msg = "returned ndarray with wrong shape" + with pytest.raises(ValueError, match=msg): + df.style._apply(lambda x: np.array([[""], [""]]), axis=None) + + def test_apply_bad_return(self): + def f(x): + return "" + + df = DataFrame([[1, 2], [3, 4]]) + msg = ( + "must return a DataFrame or ndarray when passed to `Styler.apply` " + "with axis=None" + ) + with pytest.raises(TypeError, match=msg): + df.style._apply(f, axis=None) + + @pytest.mark.parametrize("axis", ["index", "columns"]) + def test_apply_bad_labels(self, axis): + def f(x): + return DataFrame(**{axis: ["bad", "labels"]}) + + df = DataFrame([[1, 2], [3, 4]]) + msg = f"created invalid {axis} labels." + with pytest.raises(ValueError, match=msg): + df.style._apply(f, axis=None) + + def test_get_level_lengths(self): + index = MultiIndex.from_product([["a", "b"], [0, 1, 2]]) + expected = { + (0, 0): 3, + (0, 3): 3, + (1, 0): 1, + (1, 1): 1, + (1, 2): 1, + (1, 3): 1, + (1, 4): 1, + (1, 5): 1, + } + result = _get_level_lengths(index, sparsify=True, max_index=100) + tm.assert_dict_equal(result, expected) + + expected = { + (0, 0): 1, + (0, 1): 1, + (0, 2): 1, + (0, 3): 1, + (0, 4): 1, + (0, 5): 1, + (1, 0): 1, + (1, 1): 1, + (1, 2): 1, + (1, 3): 1, + (1, 4): 1, + (1, 5): 1, + } + result = _get_level_lengths(index, sparsify=False, max_index=100) + tm.assert_dict_equal(result, expected) + + def test_get_level_lengths_un_sorted(self): + index = MultiIndex.from_arrays([[1, 1, 2, 1], ["a", "b", "b", "d"]]) + expected = { + (0, 0): 2, + (0, 2): 1, + (0, 3): 1, + (1, 0): 1, + (1, 1): 1, + (1, 2): 1, + (1, 3): 1, + } + result = _get_level_lengths(index, sparsify=True, max_index=100) + tm.assert_dict_equal(result, expected) + + expected = { + (0, 0): 1, + (0, 1): 1, + (0, 2): 1, + (0, 3): 1, + (1, 0): 1, + (1, 1): 1, + (1, 2): 1, + (1, 3): 1, + } + result = _get_level_lengths(index, sparsify=False, max_index=100) + tm.assert_dict_equal(result, expected) + + def test_mi_sparse_index_names(self, blank_value): + # Test the class names and displayed value are correct on rendering MI names + df = DataFrame( + {"A": [1, 2]}, + index=MultiIndex.from_arrays( + [["a", "a"], [0, 1]], names=["idx_level_0", "idx_level_1"] + ), + ) + result = df.style._translate(True, True) + head = result["head"][1] + expected = [ + { + "class": "index_name level0", + "display_value": "idx_level_0", + "is_visible": True, + }, + { + "class": "index_name level1", + "display_value": "idx_level_1", + "is_visible": True, + }, + { + "class": "blank col0", + "display_value": blank_value, + "is_visible": True, + }, + ] + for i, expected_dict in enumerate(expected): + assert expected_dict.items() <= head[i].items() + + def test_mi_sparse_column_names(self, blank_value): + df = DataFrame( + np.arange(16).reshape(4, 4), + index=MultiIndex.from_arrays( + [["a", "a", "b", "a"], [0, 1, 1, 2]], + names=["idx_level_0", "idx_level_1"], + ), + columns=MultiIndex.from_arrays( + [["C1", "C1", "C2", "C2"], [1, 0, 1, 0]], names=["colnam_0", "colnam_1"] + ), + ) + result = Styler(df, cell_ids=False)._translate(True, True) + + for level in [0, 1]: + head = result["head"][level] + expected = [ + { + "class": "blank", + "display_value": blank_value, + "is_visible": True, + }, + { + "class": f"index_name level{level}", + "display_value": f"colnam_{level}", + "is_visible": True, + }, + ] + for i, expected_dict in enumerate(expected): + assert expected_dict.items() <= head[i].items() + + def test_hide_column_headers(self, df, styler): + ctx = styler.hide(axis="columns")._translate(True, True) + assert len(ctx["head"]) == 0 # no header entries with an unnamed index + + df.index.name = "some_name" + ctx = df.style.hide(axis="columns")._translate(True, True) + assert len(ctx["head"]) == 1 + # index names still visible, changed in #42101, reverted in 43404 + + def test_hide_single_index(self, df): + # GH 14194 + # single unnamed index + ctx = df.style._translate(True, True) + assert ctx["body"][0][0]["is_visible"] + assert ctx["head"][0][0]["is_visible"] + ctx2 = df.style.hide(axis="index")._translate(True, True) + assert not ctx2["body"][0][0]["is_visible"] + assert not ctx2["head"][0][0]["is_visible"] + + # single named index + ctx3 = df.set_index("A").style._translate(True, True) + assert ctx3["body"][0][0]["is_visible"] + assert len(ctx3["head"]) == 2 # 2 header levels + assert ctx3["head"][0][0]["is_visible"] + + ctx4 = df.set_index("A").style.hide(axis="index")._translate(True, True) + assert not ctx4["body"][0][0]["is_visible"] + assert len(ctx4["head"]) == 1 # only 1 header levels + assert not ctx4["head"][0][0]["is_visible"] + + def test_hide_multiindex(self): + # GH 14194 + df = DataFrame( + {"A": [1, 2], "B": [1, 2]}, + index=MultiIndex.from_arrays( + [["a", "a"], [0, 1]], names=["idx_level_0", "idx_level_1"] + ), + ) + ctx1 = df.style._translate(True, True) + # tests for 'a' and '0' + assert ctx1["body"][0][0]["is_visible"] + assert ctx1["body"][0][1]["is_visible"] + # check for blank header rows + assert len(ctx1["head"][0]) == 4 # two visible indexes and two data columns + + ctx2 = df.style.hide(axis="index")._translate(True, True) + # tests for 'a' and '0' + assert not ctx2["body"][0][0]["is_visible"] + assert not ctx2["body"][0][1]["is_visible"] + # check for blank header rows + assert len(ctx2["head"][0]) == 3 # one hidden (col name) and two data columns + assert not ctx2["head"][0][0]["is_visible"] + + def test_hide_columns_single_level(self, df): + # GH 14194 + # test hiding single column + ctx = df.style._translate(True, True) + assert ctx["head"][0][1]["is_visible"] + assert ctx["head"][0][1]["display_value"] == "A" + assert ctx["head"][0][2]["is_visible"] + assert ctx["head"][0][2]["display_value"] == "B" + assert ctx["body"][0][1]["is_visible"] # col A, row 1 + assert ctx["body"][1][2]["is_visible"] # col B, row 1 + + ctx = df.style.hide("A", axis="columns")._translate(True, True) + assert not ctx["head"][0][1]["is_visible"] + assert not ctx["body"][0][1]["is_visible"] # col A, row 1 + assert ctx["body"][1][2]["is_visible"] # col B, row 1 + + # test hiding multiple columns + ctx = df.style.hide(["A", "B"], axis="columns")._translate(True, True) + assert not ctx["head"][0][1]["is_visible"] + assert not ctx["head"][0][2]["is_visible"] + assert not ctx["body"][0][1]["is_visible"] # col A, row 1 + assert not ctx["body"][1][2]["is_visible"] # col B, row 1 + + def test_hide_columns_index_mult_levels(self): + # GH 14194 + # setup dataframe with multiple column levels and indices + i1 = MultiIndex.from_arrays( + [["a", "a"], [0, 1]], names=["idx_level_0", "idx_level_1"] + ) + i2 = MultiIndex.from_arrays( + [["b", "b"], [0, 1]], names=["col_level_0", "col_level_1"] + ) + df = DataFrame([[1, 2], [3, 4]], index=i1, columns=i2) + ctx = df.style._translate(True, True) + # column headers + assert ctx["head"][0][2]["is_visible"] + assert ctx["head"][1][2]["is_visible"] + assert ctx["head"][1][3]["display_value"] == "1" + # indices + assert ctx["body"][0][0]["is_visible"] + # data + assert ctx["body"][1][2]["is_visible"] + assert ctx["body"][1][2]["display_value"] == "3" + assert ctx["body"][1][3]["is_visible"] + assert ctx["body"][1][3]["display_value"] == "4" + + # hide top column level, which hides both columns + ctx = df.style.hide("b", axis="columns")._translate(True, True) + assert not ctx["head"][0][2]["is_visible"] # b + assert not ctx["head"][1][2]["is_visible"] # 0 + assert not ctx["body"][1][2]["is_visible"] # 3 + assert ctx["body"][0][0]["is_visible"] # index + + # hide first column only + ctx = df.style.hide([("b", 0)], axis="columns")._translate(True, True) + assert not ctx["head"][0][2]["is_visible"] # b + assert ctx["head"][0][3]["is_visible"] # b + assert not ctx["head"][1][2]["is_visible"] # 0 + assert not ctx["body"][1][2]["is_visible"] # 3 + assert ctx["body"][1][3]["is_visible"] + assert ctx["body"][1][3]["display_value"] == "4" + + # hide second column and index + ctx = df.style.hide([("b", 1)], axis=1).hide(axis=0)._translate(True, True) + assert not ctx["body"][0][0]["is_visible"] # index + assert len(ctx["head"][0]) == 3 + assert ctx["head"][0][1]["is_visible"] # b + assert ctx["head"][1][1]["is_visible"] # 0 + assert not ctx["head"][1][2]["is_visible"] # 1 + assert not ctx["body"][1][3]["is_visible"] # 4 + assert ctx["body"][1][2]["is_visible"] + assert ctx["body"][1][2]["display_value"] == "3" + + # hide top row level, which hides both rows so body empty + ctx = df.style.hide("a", axis="index")._translate(True, True) + assert ctx["body"] == [] + + # hide first row only + ctx = df.style.hide(("a", 0), axis="index")._translate(True, True) + for i in [0, 1, 2, 3]: + assert "row1" in ctx["body"][0][i]["class"] # row0 not included in body + assert ctx["body"][0][i]["is_visible"] + + def test_pipe(self, df): + def set_caption_from_template(styler, a, b): + return styler.set_caption(f"Dataframe with a = {a} and b = {b}") + + styler = df.style.pipe(set_caption_from_template, "A", b="B") + assert "Dataframe with a = A and b = B" in styler.to_html() + + # Test with an argument that is a (callable, keyword_name) pair. + def f(a, b, styler): + return (a, b, styler) + + styler = df.style + result = styler.pipe((f, "styler"), a=1, b=2) + assert result == (1, 2, styler) + + def test_no_cell_ids(self): + # GH 35588 + # GH 35663 + df = DataFrame(data=[[0]]) + styler = Styler(df, uuid="_", cell_ids=False) + styler.to_html() + s = styler.to_html() # render twice to ensure ctx is not updated + assert s.find('') != -1 + + @pytest.mark.parametrize( + "classes", + [ + DataFrame( + data=[["", "test-class"], [np.nan, None]], + columns=["A", "B"], + index=["a", "b"], + ), + DataFrame(data=[["test-class"]], columns=["B"], index=["a"]), + DataFrame(data=[["test-class", "unused"]], columns=["B", "C"], index=["a"]), + ], + ) + def test_set_data_classes(self, classes): + # GH 36159 + df = DataFrame(data=[[0, 1], [2, 3]], columns=["A", "B"], index=["a", "b"]) + s = Styler(df, uuid_len=0, cell_ids=False).set_td_classes(classes).to_html() + assert '0' in s + assert '1' in s + assert '2' in s + assert '3' in s + # GH 39317 + s = Styler(df, uuid_len=0, cell_ids=True).set_td_classes(classes).to_html() + assert '0' in s + assert '1' in s + assert '2' in s + assert '3' in s + + def test_set_data_classes_reindex(self): + # GH 39317 + df = DataFrame( + data=[[0, 1, 2], [3, 4, 5], [6, 7, 8]], columns=[0, 1, 2], index=[0, 1, 2] + ) + classes = DataFrame( + data=[["mi", "ma"], ["mu", "mo"]], + columns=[0, 2], + index=[0, 2], + ) + s = Styler(df, uuid_len=0).set_td_classes(classes).to_html() + assert '0' in s + assert '2' in s + assert '4' in s + assert '6' in s + assert '8' in s + + def test_chaining_table_styles(self): + # GH 35607 + df = DataFrame(data=[[0, 1], [1, 2]], columns=["A", "B"]) + styler = df.style.set_table_styles( + [{"selector": "", "props": [("background-color", "yellow")]}] + ).set_table_styles( + [{"selector": ".col0", "props": [("background-color", "blue")]}], + overwrite=False, + ) + assert len(styler.table_styles) == 2 + + def test_column_and_row_styling(self): + # GH 35607 + df = DataFrame(data=[[0, 1], [1, 2]], columns=["A", "B"]) + s = Styler(df, uuid_len=0) + s = s.set_table_styles({"A": [{"selector": "", "props": [("color", "blue")]}]}) + assert "#T_ .col0 {\n color: blue;\n}" in s.to_html() + s = s.set_table_styles( + {0: [{"selector": "", "props": [("color", "blue")]}]}, axis=1 + ) + assert "#T_ .row0 {\n color: blue;\n}" in s.to_html() + + @pytest.mark.parametrize("len_", [1, 5, 32, 33, 100]) + def test_uuid_len(self, len_): + # GH 36345 + df = DataFrame(data=[["A"]]) + s = Styler(df, uuid_len=len_, cell_ids=False).to_html() + strt = s.find('id="T_') + end = s[strt + 6 :].find('"') + if len_ > 32: + assert end == 32 + else: + assert end == len_ + + @pytest.mark.parametrize("len_", [-2, "bad", None]) + def test_uuid_len_raises(self, len_): + # GH 36345 + df = DataFrame(data=[["A"]]) + msg = "``uuid_len`` must be an integer in range \\[0, 32\\]." + with pytest.raises(TypeError, match=msg): + Styler(df, uuid_len=len_, cell_ids=False).to_html() + + @pytest.mark.parametrize( + "slc", + [ + IndexSlice[:, :], + IndexSlice[:, 1], + IndexSlice[1, :], + IndexSlice[[1], [1]], + IndexSlice[1, [1]], + IndexSlice[[1], 1], + IndexSlice[1], + IndexSlice[1, 1], + slice(None, None, None), + [0, 1], + np.array([0, 1]), + Series([0, 1]), + ], + ) + def test_non_reducing_slice(self, slc): + df = DataFrame([[0, 1], [2, 3]]) + + tslice_ = non_reducing_slice(slc) + assert isinstance(df.loc[tslice_], DataFrame) + + @pytest.mark.parametrize("box", [list, Series, np.array]) + def test_list_slice(self, box): + # like dataframe getitem + subset = box(["A"]) + + df = DataFrame({"A": [1, 2], "B": [3, 4]}, index=["A", "B"]) + expected = IndexSlice[:, ["A"]] + + result = non_reducing_slice(subset) + tm.assert_frame_equal(df.loc[result], df.loc[expected]) + + def test_non_reducing_slice_on_multiindex(self): + # GH 19861 + dic = { + ("a", "d"): [1, 4], + ("a", "c"): [2, 3], + ("b", "c"): [3, 2], + ("b", "d"): [4, 1], + } + df = DataFrame(dic, index=[0, 1]) + idx = IndexSlice + slice_ = idx[:, idx["b", "d"]] + tslice_ = non_reducing_slice(slice_) + + result = df.loc[tslice_] + expected = DataFrame({("b", "d"): [4, 1]}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "slice_", + [ + IndexSlice[:, :], + # check cols + IndexSlice[:, IndexSlice[["a"]]], # inferred deeper need list + IndexSlice[:, IndexSlice[["a"], ["c"]]], # inferred deeper need list + IndexSlice[:, IndexSlice["a", "c", :]], + IndexSlice[:, IndexSlice["a", :, "e"]], + IndexSlice[:, IndexSlice[:, "c", "e"]], + IndexSlice[:, IndexSlice["a", ["c", "d"], :]], # check list + IndexSlice[:, IndexSlice["a", ["c", "d", "-"], :]], # don't allow missing + IndexSlice[:, IndexSlice["a", ["c", "d", "-"], "e"]], # no slice + # check rows + IndexSlice[IndexSlice[["U"]], :], # inferred deeper need list + IndexSlice[IndexSlice[["U"], ["W"]], :], # inferred deeper need list + IndexSlice[IndexSlice["U", "W", :], :], + IndexSlice[IndexSlice["U", :, "Y"], :], + IndexSlice[IndexSlice[:, "W", "Y"], :], + IndexSlice[IndexSlice[:, "W", ["Y", "Z"]], :], # check list + IndexSlice[IndexSlice[:, "W", ["Y", "Z", "-"]], :], # don't allow missing + IndexSlice[IndexSlice["U", "W", ["Y", "Z", "-"]], :], # no slice + # check simultaneous + IndexSlice[IndexSlice[:, "W", "Y"], IndexSlice["a", "c", :]], + ], + ) + def test_non_reducing_multi_slice_on_multiindex(self, slice_): + # GH 33562 + cols = MultiIndex.from_product([["a", "b"], ["c", "d"], ["e", "f"]]) + idxs = MultiIndex.from_product([["U", "V"], ["W", "X"], ["Y", "Z"]]) + df = DataFrame(np.arange(64).reshape(8, 8), columns=cols, index=idxs) + + for lvl in [0, 1]: + key = slice_[lvl] + if isinstance(key, tuple): + for subkey in key: + if isinstance(subkey, list) and "-" in subkey: + # not present in the index level, raises KeyError since 2.0 + with pytest.raises(KeyError, match="-"): + df.loc[slice_] + return + + expected = df.loc[slice_] + result = df.loc[non_reducing_slice(slice_)] + tm.assert_frame_equal(result, expected) + + +def test_hidden_index_names(mi_df): + mi_df.index.names = ["Lev0", "Lev1"] + mi_styler = mi_df.style + ctx = mi_styler._translate(True, True) + assert len(ctx["head"]) == 3 # 2 column index levels + 1 index names row + + mi_styler.hide(axis="index", names=True) + ctx = mi_styler._translate(True, True) + assert len(ctx["head"]) == 2 # index names row is unparsed + for i in range(4): + assert ctx["body"][0][i]["is_visible"] # 2 index levels + 2 data values visible + + mi_styler.hide(axis="index", level=1) + ctx = mi_styler._translate(True, True) + assert len(ctx["head"]) == 2 # index names row is still hidden + assert ctx["body"][0][0]["is_visible"] is True + assert ctx["body"][0][1]["is_visible"] is False + + +def test_hidden_column_names(mi_df): + mi_df.columns.names = ["Lev0", "Lev1"] + mi_styler = mi_df.style + ctx = mi_styler._translate(True, True) + assert ctx["head"][0][1]["display_value"] == "Lev0" + assert ctx["head"][1][1]["display_value"] == "Lev1" + + mi_styler.hide(names=True, axis="columns") + ctx = mi_styler._translate(True, True) + assert ctx["head"][0][1]["display_value"] == " " + assert ctx["head"][1][1]["display_value"] == " " + + mi_styler.hide(level=0, axis="columns") + ctx = mi_styler._translate(True, True) + assert len(ctx["head"]) == 1 # no index names and only one visible column headers + assert ctx["head"][0][1]["display_value"] == " " + + +@pytest.mark.parametrize("caption", [1, ("a", "b", "c"), (1, "s")]) +def test_caption_raises(mi_styler, caption): + msg = "`caption` must be either a string or 2-tuple of strings." + with pytest.raises(ValueError, match=msg): + mi_styler.set_caption(caption) + + +def test_hiding_headers_over_index_no_sparsify(): + # GH 43464 + midx = MultiIndex.from_product([[1, 2], ["a", "a", "b"]]) + df = DataFrame(9, index=midx, columns=[0]) + ctx = df.style._translate(False, False) + assert len(ctx["body"]) == 6 + ctx = df.style.hide((1, "a"), axis=0)._translate(False, False) + assert len(ctx["body"]) == 4 + assert "row2" in ctx["body"][0][0]["class"] + + +def test_hiding_headers_over_columns_no_sparsify(): + # GH 43464 + midx = MultiIndex.from_product([[1, 2], ["a", "a", "b"]]) + df = DataFrame(9, columns=midx, index=[0]) + ctx = df.style._translate(False, False) + for ix in [(0, 1), (0, 2), (1, 1), (1, 2)]: + assert ctx["head"][ix[0]][ix[1]]["is_visible"] is True + ctx = df.style.hide((1, "a"), axis="columns")._translate(False, False) + for ix in [(0, 1), (0, 2), (1, 1), (1, 2)]: + assert ctx["head"][ix[0]][ix[1]]["is_visible"] is False + + +def test_get_level_lengths_mi_hidden(): + # GH 43464 + index = MultiIndex.from_arrays([[1, 1, 1, 2, 2, 2], ["a", "a", "b", "a", "a", "b"]]) + expected = { + (0, 2): 1, + (0, 3): 1, + (0, 4): 1, + (0, 5): 1, + (1, 2): 1, + (1, 3): 1, + (1, 4): 1, + (1, 5): 1, + } + result = _get_level_lengths( + index, + sparsify=False, + max_index=100, + hidden_elements=[0, 1, 0, 1], # hidden element can repeat if duplicated index + ) + tm.assert_dict_equal(result, expected) + + +def test_row_trimming_hide_index(): + # gh 43703 + df = DataFrame([[1], [2], [3], [4], [5]]) + with option_context("styler.render.max_rows", 2): + ctx = df.style.hide([0, 1], axis="index")._translate(True, True) + assert len(ctx["body"]) == 3 + for r, val in enumerate(["3", "4", "..."]): + assert ctx["body"][r][1]["display_value"] == val + + +def test_row_trimming_hide_index_mi(): + # gh 44247 + df = DataFrame([[1], [2], [3], [4], [5]]) + df.index = MultiIndex.from_product([[0], [0, 1, 2, 3, 4]]) + with option_context("styler.render.max_rows", 2): + ctx = df.style.hide([(0, 0), (0, 1)], axis="index")._translate(True, True) + assert len(ctx["body"]) == 3 + + # level 0 index headers (sparsified) + assert {"value": 0, "attributes": 'rowspan="2"', "is_visible": True}.items() <= ctx[ + "body" + ][0][0].items() + assert {"value": 0, "attributes": "", "is_visible": False}.items() <= ctx["body"][ + 1 + ][0].items() + assert {"value": "...", "is_visible": True}.items() <= ctx["body"][2][0].items() + + for r, val in enumerate(["2", "3", "..."]): + assert ctx["body"][r][1]["display_value"] == val # level 1 index headers + for r, val in enumerate(["3", "4", "..."]): + assert ctx["body"][r][2]["display_value"] == val # data values + + +def test_col_trimming_hide_columns(): + # gh 44272 + df = DataFrame([[1, 2, 3, 4, 5]]) + with option_context("styler.render.max_columns", 2): + ctx = df.style.hide([0, 1], axis="columns")._translate(True, True) + + assert len(ctx["head"][0]) == 6 # blank, [0, 1 (hidden)], [2 ,3 (visible)], + trim + for c, vals in enumerate([(1, False), (2, True), (3, True), ("...", True)]): + assert ctx["head"][0][c + 2]["value"] == vals[0] + assert ctx["head"][0][c + 2]["is_visible"] == vals[1] + + assert len(ctx["body"][0]) == 6 # index + 2 hidden + 2 visible + trimming col + + +def test_no_empty_apply(mi_styler): + # 45313 + mi_styler.apply(lambda s: ["a:v;"] * 2, subset=[False, False]) + mi_styler._compute() + + +@pytest.mark.parametrize("format", ["html", "latex", "string"]) +def test_output_buffer(mi_styler, format): + # gh 47053 + with tm.ensure_clean(f"delete_me.{format}") as f: + getattr(mi_styler, f"to_{format}")(f) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_to_latex.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_to_latex.py new file mode 100644 index 0000000000000000000000000000000000000000..7f1443c3ee66be040f668f546682924207cfd31e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/formats/style/test_to_latex.py @@ -0,0 +1,1090 @@ +from textwrap import dedent + +import numpy as np +import pytest + +from pandas import ( + DataFrame, + MultiIndex, + Series, + option_context, +) + +pytest.importorskip("jinja2") +from pandas.io.formats.style import Styler +from pandas.io.formats.style_render import ( + _parse_latex_cell_styles, + _parse_latex_css_conversion, + _parse_latex_header_span, + _parse_latex_table_styles, + _parse_latex_table_wrapping, +) + + +@pytest.fixture +def df(): + return DataFrame( + {"A": [0, 1], "B": [-0.61, -1.22], "C": Series(["ab", "cd"], dtype=object)} + ) + + +@pytest.fixture +def df_ext(): + return DataFrame( + {"A": [0, 1, 2], "B": [-0.61, -1.22, -2.22], "C": ["ab", "cd", "de"]} + ) + + +@pytest.fixture +def styler(df): + return Styler(df, uuid_len=0, precision=2) + + +def test_minimal_latex_tabular(styler): + expected = dedent( + """\ + \\begin{tabular}{lrrl} + & A & B & C \\\\ + 0 & 0 & -0.61 & ab \\\\ + 1 & 1 & -1.22 & cd \\\\ + \\end{tabular} + """ + ) + assert styler.to_latex() == expected + + +def test_tabular_hrules(styler): + expected = dedent( + """\ + \\begin{tabular}{lrrl} + \\toprule + & A & B & C \\\\ + \\midrule + 0 & 0 & -0.61 & ab \\\\ + 1 & 1 & -1.22 & cd \\\\ + \\bottomrule + \\end{tabular} + """ + ) + assert styler.to_latex(hrules=True) == expected + + +def test_tabular_custom_hrules(styler): + styler.set_table_styles( + [ + {"selector": "toprule", "props": ":hline"}, + {"selector": "bottomrule", "props": ":otherline"}, + ] + ) # no midrule + expected = dedent( + """\ + \\begin{tabular}{lrrl} + \\hline + & A & B & C \\\\ + 0 & 0 & -0.61 & ab \\\\ + 1 & 1 & -1.22 & cd \\\\ + \\otherline + \\end{tabular} + """ + ) + assert styler.to_latex() == expected + + +def test_column_format(styler): + # default setting is already tested in `test_latex_minimal_tabular` + styler.set_table_styles([{"selector": "column_format", "props": ":cccc"}]) + + assert "\\begin{tabular}{rrrr}" in styler.to_latex(column_format="rrrr") + styler.set_table_styles([{"selector": "column_format", "props": ":r|r|cc"}]) + assert "\\begin{tabular}{r|r|cc}" in styler.to_latex() + + +def test_siunitx_cols(styler): + expected = dedent( + """\ + \\begin{tabular}{lSSl} + {} & {A} & {B} & {C} \\\\ + 0 & 0 & -0.61 & ab \\\\ + 1 & 1 & -1.22 & cd \\\\ + \\end{tabular} + """ + ) + assert styler.to_latex(siunitx=True) == expected + + +def test_position(styler): + assert "\\begin{table}[h!]" in styler.to_latex(position="h!") + assert "\\end{table}" in styler.to_latex(position="h!") + styler.set_table_styles([{"selector": "position", "props": ":b!"}]) + assert "\\begin{table}[b!]" in styler.to_latex() + assert "\\end{table}" in styler.to_latex() + + +@pytest.mark.parametrize("env", [None, "longtable"]) +def test_label(styler, env): + assert "\n\\label{text}" in styler.to_latex(label="text", environment=env) + styler.set_table_styles([{"selector": "label", "props": ":{more §text}"}]) + assert "\n\\label{more :text}" in styler.to_latex(environment=env) + + +def test_position_float_raises(styler): + msg = "`position_float` should be one of 'raggedright', 'raggedleft', 'centering'," + with pytest.raises(ValueError, match=msg): + styler.to_latex(position_float="bad_string") + + msg = "`position_float` cannot be used in 'longtable' `environment`" + with pytest.raises(ValueError, match=msg): + styler.to_latex(position_float="centering", environment="longtable") + + +@pytest.mark.parametrize("label", [(None, ""), ("text", "\\label{text}")]) +@pytest.mark.parametrize("position", [(None, ""), ("h!", "{table}[h!]")]) +@pytest.mark.parametrize("caption", [(None, ""), ("text", "\\caption{text}")]) +@pytest.mark.parametrize("column_format", [(None, ""), ("rcrl", "{tabular}{rcrl}")]) +@pytest.mark.parametrize("position_float", [(None, ""), ("centering", "\\centering")]) +def test_kwargs_combinations( + styler, label, position, caption, column_format, position_float +): + result = styler.to_latex( + label=label[0], + position=position[0], + caption=caption[0], + column_format=column_format[0], + position_float=position_float[0], + ) + assert label[1] in result + assert position[1] in result + assert caption[1] in result + assert column_format[1] in result + assert position_float[1] in result + + +def test_custom_table_styles(styler): + styler.set_table_styles( + [ + {"selector": "mycommand", "props": ":{myoptions}"}, + {"selector": "mycommand2", "props": ":{myoptions2}"}, + ] + ) + expected = dedent( + """\ + \\begin{table} + \\mycommand{myoptions} + \\mycommand2{myoptions2} + """ + ) + assert expected in styler.to_latex() + + +def test_cell_styling(styler): + styler.highlight_max(props="itshape:;Huge:--wrap;") + expected = dedent( + """\ + \\begin{tabular}{lrrl} + & A & B & C \\\\ + 0 & 0 & \\itshape {\\Huge -0.61} & ab \\\\ + 1 & \\itshape {\\Huge 1} & -1.22 & \\itshape {\\Huge cd} \\\\ + \\end{tabular} + """ + ) + assert expected == styler.to_latex() + + +def test_multiindex_columns(df): + cidx = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "c")]) + df.columns = cidx + expected = dedent( + """\ + \\begin{tabular}{lrrl} + & \\multicolumn{2}{r}{A} & B \\\\ + & a & b & c \\\\ + 0 & 0 & -0.61 & ab \\\\ + 1 & 1 & -1.22 & cd \\\\ + \\end{tabular} + """ + ) + s = df.style.format(precision=2) + assert expected == s.to_latex() + + # non-sparse + expected = dedent( + """\ + \\begin{tabular}{lrrl} + & A & A & B \\\\ + & a & b & c \\\\ + 0 & 0 & -0.61 & ab \\\\ + 1 & 1 & -1.22 & cd \\\\ + \\end{tabular} + """ + ) + s = df.style.format(precision=2) + assert expected == s.to_latex(sparse_columns=False) + + +def test_multiindex_row(df_ext): + ridx = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "c")]) + df_ext.index = ridx + expected = dedent( + """\ + \\begin{tabular}{llrrl} + & & A & B & C \\\\ + \\multirow[c]{2}{*}{A} & a & 0 & -0.61 & ab \\\\ + & b & 1 & -1.22 & cd \\\\ + B & c & 2 & -2.22 & de \\\\ + \\end{tabular} + """ + ) + styler = df_ext.style.format(precision=2) + result = styler.to_latex() + assert expected == result + + # non-sparse + expected = dedent( + """\ + \\begin{tabular}{llrrl} + & & A & B & C \\\\ + A & a & 0 & -0.61 & ab \\\\ + A & b & 1 & -1.22 & cd \\\\ + B & c & 2 & -2.22 & de \\\\ + \\end{tabular} + """ + ) + result = styler.to_latex(sparse_index=False) + assert expected == result + + +def test_multirow_naive(df_ext): + ridx = MultiIndex.from_tuples([("X", "x"), ("X", "y"), ("Y", "z")]) + df_ext.index = ridx + expected = dedent( + """\ + \\begin{tabular}{llrrl} + & & A & B & C \\\\ + X & x & 0 & -0.61 & ab \\\\ + & y & 1 & -1.22 & cd \\\\ + Y & z & 2 & -2.22 & de \\\\ + \\end{tabular} + """ + ) + styler = df_ext.style.format(precision=2) + result = styler.to_latex(multirow_align="naive") + assert expected == result + + +def test_multiindex_row_and_col(df_ext): + cidx = MultiIndex.from_tuples([("Z", "a"), ("Z", "b"), ("Y", "c")]) + ridx = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "c")]) + df_ext.index, df_ext.columns = ridx, cidx + expected = dedent( + """\ + \\begin{tabular}{llrrl} + & & \\multicolumn{2}{l}{Z} & Y \\\\ + & & a & b & c \\\\ + \\multirow[b]{2}{*}{A} & a & 0 & -0.61 & ab \\\\ + & b & 1 & -1.22 & cd \\\\ + B & c & 2 & -2.22 & de \\\\ + \\end{tabular} + """ + ) + styler = df_ext.style.format(precision=2) + result = styler.to_latex(multirow_align="b", multicol_align="l") + assert result == expected + + # non-sparse + expected = dedent( + """\ + \\begin{tabular}{llrrl} + & & Z & Z & Y \\\\ + & & a & b & c \\\\ + A & a & 0 & -0.61 & ab \\\\ + A & b & 1 & -1.22 & cd \\\\ + B & c & 2 & -2.22 & de \\\\ + \\end{tabular} + """ + ) + result = styler.to_latex(sparse_index=False, sparse_columns=False) + assert result == expected + + +@pytest.mark.parametrize( + "multicol_align, siunitx, header", + [ + ("naive-l", False, " & A & &"), + ("naive-r", False, " & & & A"), + ("naive-l", True, "{} & {A} & {} & {}"), + ("naive-r", True, "{} & {} & {} & {A}"), + ], +) +def test_multicol_naive(df, multicol_align, siunitx, header): + ridx = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("A", "c")]) + df.columns = ridx + level1 = " & a & b & c" if not siunitx else "{} & {a} & {b} & {c}" + col_format = "lrrl" if not siunitx else "lSSl" + expected = dedent( + f"""\ + \\begin{{tabular}}{{{col_format}}} + {header} \\\\ + {level1} \\\\ + 0 & 0 & -0.61 & ab \\\\ + 1 & 1 & -1.22 & cd \\\\ + \\end{{tabular}} + """ + ) + styler = df.style.format(precision=2) + result = styler.to_latex(multicol_align=multicol_align, siunitx=siunitx) + assert expected == result + + +def test_multi_options(df_ext): + cidx = MultiIndex.from_tuples([("Z", "a"), ("Z", "b"), ("Y", "c")]) + ridx = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "c")]) + df_ext.index, df_ext.columns = ridx, cidx + styler = df_ext.style.format(precision=2) + + expected = dedent( + """\ + & & \\multicolumn{2}{r}{Z} & Y \\\\ + & & a & b & c \\\\ + \\multirow[c]{2}{*}{A} & a & 0 & -0.61 & ab \\\\ + """ + ) + result = styler.to_latex() + assert expected in result + + with option_context("styler.latex.multicol_align", "l"): + assert " & & \\multicolumn{2}{l}{Z} & Y \\\\" in styler.to_latex() + + with option_context("styler.latex.multirow_align", "b"): + assert "\\multirow[b]{2}{*}{A} & a & 0 & -0.61 & ab \\\\" in styler.to_latex() + + +def test_multiindex_columns_hidden(): + df = DataFrame([[1, 2, 3, 4]]) + df.columns = MultiIndex.from_tuples([("A", 1), ("A", 2), ("A", 3), ("B", 1)]) + s = df.style + assert "{tabular}{lrrrr}" in s.to_latex() + s.set_table_styles([]) # reset the position command + s.hide([("A", 2)], axis="columns") + assert "{tabular}{lrrr}" in s.to_latex() + + +@pytest.mark.parametrize( + "option, value", + [ + ("styler.sparse.index", True), + ("styler.sparse.index", False), + ("styler.sparse.columns", True), + ("styler.sparse.columns", False), + ], +) +def test_sparse_options(df_ext, option, value): + cidx = MultiIndex.from_tuples([("Z", "a"), ("Z", "b"), ("Y", "c")]) + ridx = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "c")]) + df_ext.index, df_ext.columns = ridx, cidx + styler = df_ext.style + + latex1 = styler.to_latex() + with option_context(option, value): + latex2 = styler.to_latex() + assert (latex1 == latex2) is value + + +def test_hidden_index(styler): + styler.hide(axis="index") + expected = dedent( + """\ + \\begin{tabular}{rrl} + A & B & C \\\\ + 0 & -0.61 & ab \\\\ + 1 & -1.22 & cd \\\\ + \\end{tabular} + """ + ) + assert styler.to_latex() == expected + + +@pytest.mark.parametrize("environment", ["table", "figure*", None]) +def test_comprehensive(df_ext, environment): + # test as many low level features simultaneously as possible + cidx = MultiIndex.from_tuples([("Z", "a"), ("Z", "b"), ("Y", "c")]) + ridx = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "c")]) + df_ext.index, df_ext.columns = ridx, cidx + stlr = df_ext.style + stlr.set_caption("mycap") + stlr.set_table_styles( + [ + {"selector": "label", "props": ":{fig§item}"}, + {"selector": "position", "props": ":h!"}, + {"selector": "position_float", "props": ":centering"}, + {"selector": "column_format", "props": ":rlrlr"}, + {"selector": "toprule", "props": ":toprule"}, + {"selector": "midrule", "props": ":midrule"}, + {"selector": "bottomrule", "props": ":bottomrule"}, + {"selector": "rowcolors", "props": ":{3}{pink}{}"}, # custom command + ] + ) + stlr.highlight_max(axis=0, props="textbf:--rwrap;cellcolor:[rgb]{1,1,0.6}--rwrap") + stlr.highlight_max(axis=None, props="Huge:--wrap;", subset=[("Z", "a"), ("Z", "b")]) + + expected = ( + """\ +\\begin{table}[h!] +\\centering +\\caption{mycap} +\\label{fig:item} +\\rowcolors{3}{pink}{} +\\begin{tabular}{rlrlr} +\\toprule + & & \\multicolumn{2}{r}{Z} & Y \\\\ + & & a & b & c \\\\ +\\midrule +\\multirow[c]{2}{*}{A} & a & 0 & \\textbf{\\cellcolor[rgb]{1,1,0.6}{-0.61}} & ab \\\\ + & b & 1 & -1.22 & cd \\\\ +B & c & \\textbf{\\cellcolor[rgb]{1,1,0.6}{{\\Huge 2}}} & -2.22 & """ + """\ +\\textbf{\\cellcolor[rgb]{1,1,0.6}{de}} \\\\ +\\bottomrule +\\end{tabular} +\\end{table} +""" + ).replace("table", environment if environment else "table") + result = stlr.format(precision=2).to_latex(environment=environment) + assert result == expected + + +def test_environment_option(styler): + with option_context("styler.latex.environment", "bar-env"): + assert "\\begin{bar-env}" in styler.to_latex() + assert "\\begin{foo-env}" in styler.to_latex(environment="foo-env") + + +def test_parse_latex_table_styles(styler): + styler.set_table_styles( + [ + {"selector": "foo", "props": [("attr", "value")]}, + {"selector": "bar", "props": [("attr", "overwritten")]}, + {"selector": "bar", "props": [("attr", "baz"), ("attr2", "ignored")]}, + {"selector": "label", "props": [("", "{fig§item}")]}, + ] + ) + assert _parse_latex_table_styles(styler.table_styles, "bar") == "baz" + + # test '§' replaced by ':' [for CSS compatibility] + assert _parse_latex_table_styles(styler.table_styles, "label") == "{fig:item}" + + +def test_parse_latex_cell_styles_basic(): # test nesting + cell_style = [("itshape", "--rwrap"), ("cellcolor", "[rgb]{0,1,1}--rwrap")] + expected = "\\itshape{\\cellcolor[rgb]{0,1,1}{text}}" + assert _parse_latex_cell_styles(cell_style, "text") == expected + + +@pytest.mark.parametrize( + "wrap_arg, expected", + [ # test wrapping + ("", "\\ "), + ("--wrap", "{\\ }"), + ("--nowrap", "\\ "), + ("--lwrap", "{\\} "), + ("--dwrap", "{\\}{}"), + ("--rwrap", "\\{}"), + ], +) +def test_parse_latex_cell_styles_braces(wrap_arg, expected): + cell_style = [("", f"{wrap_arg}")] + assert _parse_latex_cell_styles(cell_style, "") == expected + + +def test_parse_latex_header_span(): + cell = {"attributes": 'colspan="3"', "display_value": "text", "cellstyle": []} + expected = "\\multicolumn{3}{Y}{text}" + assert _parse_latex_header_span(cell, "X", "Y") == expected + + cell = {"attributes": 'rowspan="5"', "display_value": "text", "cellstyle": []} + expected = "\\multirow[X]{5}{*}{text}" + assert _parse_latex_header_span(cell, "X", "Y") == expected + + cell = {"display_value": "text", "cellstyle": []} + assert _parse_latex_header_span(cell, "X", "Y") == "text" + + cell = {"display_value": "text", "cellstyle": [("bfseries", "--rwrap")]} + assert _parse_latex_header_span(cell, "X", "Y") == "\\bfseries{text}" + + +def test_parse_latex_table_wrapping(styler): + styler.set_table_styles( + [ + {"selector": "toprule", "props": ":value"}, + {"selector": "bottomrule", "props": ":value"}, + {"selector": "midrule", "props": ":value"}, + {"selector": "column_format", "props": ":value"}, + ] + ) + assert _parse_latex_table_wrapping(styler.table_styles, styler.caption) is False + assert _parse_latex_table_wrapping(styler.table_styles, "some caption") is True + styler.set_table_styles( + [ + {"selector": "not-ignored", "props": ":value"}, + ], + overwrite=False, + ) + assert _parse_latex_table_wrapping(styler.table_styles, None) is True + + +def test_short_caption(styler): + result = styler.to_latex(caption=("full cap", "short cap")) + assert "\\caption[short cap]{full cap}" in result + + +@pytest.mark.parametrize( + "css, expected", + [ + ([("color", "red")], [("color", "{red}")]), # test color and input format types + ( + [("color", "rgb(128, 128, 128 )")], + [("color", "[rgb]{0.502, 0.502, 0.502}")], + ), + ( + [("color", "rgb(128, 50%, 25% )")], + [("color", "[rgb]{0.502, 0.500, 0.250}")], + ), + ( + [("color", "rgba(128,128,128,1)")], + [("color", "[rgb]{0.502, 0.502, 0.502}")], + ), + ([("color", "#FF00FF")], [("color", "[HTML]{FF00FF}")]), + ([("color", "#F0F")], [("color", "[HTML]{FF00FF}")]), + ([("font-weight", "bold")], [("bfseries", "")]), # test font-weight and types + ([("font-weight", "bolder")], [("bfseries", "")]), + ([("font-weight", "normal")], []), + ([("background-color", "red")], [("cellcolor", "{red}--lwrap")]), + ( + [("background-color", "#FF00FF")], # test background-color command and wrap + [("cellcolor", "[HTML]{FF00FF}--lwrap")], + ), + ([("font-style", "italic")], [("itshape", "")]), # test font-style and types + ([("font-style", "oblique")], [("slshape", "")]), + ([("font-style", "normal")], []), + ([("color", "red /*--dwrap*/")], [("color", "{red}--dwrap")]), # css comments + ([("background-color", "red /* --dwrap */")], [("cellcolor", "{red}--dwrap")]), + ], +) +def test_parse_latex_css_conversion(css, expected): + result = _parse_latex_css_conversion(css) + assert result == expected + + +@pytest.mark.parametrize( + "env, inner_env", + [ + (None, "tabular"), + ("table", "tabular"), + ("longtable", "longtable"), + ], +) +@pytest.mark.parametrize( + "convert, exp", [(True, "bfseries"), (False, "font-weightbold")] +) +def test_parse_latex_css_convert_minimal(styler, env, inner_env, convert, exp): + # parameters ensure longtable template is also tested + styler.highlight_max(props="font-weight:bold;") + result = styler.to_latex(convert_css=convert, environment=env) + expected = dedent( + f"""\ + 0 & 0 & \\{exp} -0.61 & ab \\\\ + 1 & \\{exp} 1 & -1.22 & \\{exp} cd \\\\ + \\end{{{inner_env}}} + """ + ) + assert expected in result + + +def test_parse_latex_css_conversion_option(): + css = [("command", "option--latex--wrap")] + expected = [("command", "option--wrap")] + result = _parse_latex_css_conversion(css) + assert result == expected + + +def test_styler_object_after_render(styler): + # GH 42320 + pre_render = styler._copy(deepcopy=True) + styler.to_latex( + column_format="rllr", + position="h", + position_float="centering", + hrules=True, + label="my lab", + caption="my cap", + ) + + assert pre_render.table_styles == styler.table_styles + assert pre_render.caption == styler.caption + + +def test_longtable_comprehensive(styler): + result = styler.to_latex( + environment="longtable", hrules=True, label="fig:A", caption=("full", "short") + ) + expected = dedent( + """\ + \\begin{longtable}{lrrl} + \\caption[short]{full} \\label{fig:A} \\\\ + \\toprule + & A & B & C \\\\ + \\midrule + \\endfirsthead + \\caption[]{full} \\\\ + \\toprule + & A & B & C \\\\ + \\midrule + \\endhead + \\midrule + \\multicolumn{4}{r}{Continued on next page} \\\\ + \\midrule + \\endfoot + \\bottomrule + \\endlastfoot + 0 & 0 & -0.61 & ab \\\\ + 1 & 1 & -1.22 & cd \\\\ + \\end{longtable} + """ + ) + assert result == expected + + +def test_longtable_minimal(styler): + result = styler.to_latex(environment="longtable") + expected = dedent( + """\ + \\begin{longtable}{lrrl} + & A & B & C \\\\ + \\endfirsthead + & A & B & C \\\\ + \\endhead + \\multicolumn{4}{r}{Continued on next page} \\\\ + \\endfoot + \\endlastfoot + 0 & 0 & -0.61 & ab \\\\ + 1 & 1 & -1.22 & cd \\\\ + \\end{longtable} + """ + ) + assert result == expected + + +@pytest.mark.parametrize( + "sparse, exp, siunitx", + [ + (True, "{} & \\multicolumn{2}{r}{A} & {B}", True), + (False, "{} & {A} & {A} & {B}", True), + (True, " & \\multicolumn{2}{r}{A} & B", False), + (False, " & A & A & B", False), + ], +) +def test_longtable_multiindex_columns(df, sparse, exp, siunitx): + cidx = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "c")]) + df.columns = cidx + with_si = "{} & {a} & {b} & {c} \\\\" + without_si = " & a & b & c \\\\" + expected = dedent( + f"""\ + \\begin{{longtable}}{{l{"SS" if siunitx else "rr"}l}} + {exp} \\\\ + {with_si if siunitx else without_si} + \\endfirsthead + {exp} \\\\ + {with_si if siunitx else without_si} + \\endhead + """ + ) + result = df.style.to_latex( + environment="longtable", sparse_columns=sparse, siunitx=siunitx + ) + assert expected in result + + +@pytest.mark.parametrize( + "caption, cap_exp", + [ + ("full", ("{full}", "")), + (("full", "short"), ("{full}", "[short]")), + ], +) +@pytest.mark.parametrize("label, lab_exp", [(None, ""), ("tab:A", " \\label{tab:A}")]) +def test_longtable_caption_label(styler, caption, cap_exp, label, lab_exp): + cap_exp1 = f"\\caption{cap_exp[1]}{cap_exp[0]}" + cap_exp2 = f"\\caption[]{cap_exp[0]}" + + expected = dedent( + f"""\ + {cap_exp1}{lab_exp} \\\\ + & A & B & C \\\\ + \\endfirsthead + {cap_exp2} \\\\ + """ + ) + assert expected in styler.to_latex( + environment="longtable", caption=caption, label=label + ) + + +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize( + "columns, siunitx", + [ + (True, True), + (True, False), + (False, False), + ], +) +def test_apply_map_header_render_mi(df_ext, index, columns, siunitx): + cidx = MultiIndex.from_tuples([("Z", "a"), ("Z", "b"), ("Y", "c")]) + ridx = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "c")]) + df_ext.index, df_ext.columns = ridx, cidx + styler = df_ext.style + + func = lambda v: "bfseries: --rwrap" if "A" in v or "Z" in v or "c" in v else None + + if index: + styler.map_index(func, axis="index") + if columns: + styler.map_index(func, axis="columns") + + result = styler.to_latex(siunitx=siunitx) + + expected_index = dedent( + """\ + \\multirow[c]{2}{*}{\\bfseries{A}} & a & 0 & -0.610000 & ab \\\\ + \\bfseries{} & b & 1 & -1.220000 & cd \\\\ + B & \\bfseries{c} & 2 & -2.220000 & de \\\\ + """ + ) + assert (expected_index in result) is index + + exp_cols_si = dedent( + """\ + {} & {} & \\multicolumn{2}{r}{\\bfseries{Z}} & {Y} \\\\ + {} & {} & {a} & {b} & {\\bfseries{c}} \\\\ + """ + ) + exp_cols_no_si = """\ + & & \\multicolumn{2}{r}{\\bfseries{Z}} & Y \\\\ + & & a & b & \\bfseries{c} \\\\ +""" + assert ((exp_cols_si if siunitx else exp_cols_no_si) in result) is columns + + +def test_repr_option(styler): + assert "