| """Metrics to assess performance on classification task given scores. |
| |
| Functions named as ``*_score`` return a scalar value to maximize: the higher |
| the better. |
| |
| Functions named as ``*_error`` or ``*_loss`` return a scalar value to minimize: |
| the lower the better. |
| """ |
|
|
| |
| |
|
|
|
|
| import warnings |
| from functools import partial |
| from numbers import Integral, Real |
|
|
| import numpy as np |
| from scipy.integrate import trapezoid |
| from scipy.sparse import csr_matrix, issparse |
| from scipy.stats import rankdata |
|
|
| from ..exceptions import UndefinedMetricWarning |
| from ..preprocessing import label_binarize |
| from ..utils import ( |
| assert_all_finite, |
| check_array, |
| check_consistent_length, |
| column_or_1d, |
| ) |
| from ..utils._encode import _encode, _unique |
| from ..utils._param_validation import Hidden, Interval, StrOptions, validate_params |
| from ..utils.extmath import stable_cumsum |
| from ..utils.multiclass import type_of_target |
| from ..utils.sparsefuncs import count_nonzero |
| from ..utils.validation import _check_pos_label_consistency, _check_sample_weight |
| from ._base import _average_binary_score, _average_multiclass_ovo_score |
|
|
|
|
@validate_params(
    {"x": ["array-like"], "y": ["array-like"]},
    prefer_skip_nested_validation=True,
)
def auc(x, y):
    """Integrate a sampled curve with the trapezoidal rule (Area Under the Curve).

    This is the generic routine: it only needs the points of a curve. To get
    the area under a ROC curve directly from labels and scores, use
    :func:`roc_auc_score`. To summarize a precision-recall curve without
    linear interpolation, use :func:`average_precision_score`.

    Parameters
    ----------
    x : array-like of shape (n,)
        X coordinates. They must be monotonic, either increasing or
        decreasing.
    y : array-like of shape (n,)
        Y coordinates.

    Returns
    -------
    auc : float
        Area Under the Curve.

    Raises
    ------
    ValueError
        If fewer than two points are given, or if `x` is neither increasing
        nor decreasing.

    See Also
    --------
    roc_auc_score : Compute the area under the ROC curve.
    average_precision_score : Compute average precision from prediction scores.
    precision_recall_curve : Compute precision-recall pairs for different
        probability thresholds.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn import metrics
    >>> y = np.array([1, 1, 2, 2])
    >>> pred = np.array([0.1, 0.4, 0.35, 0.8])
    >>> fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2)
    >>> metrics.auc(fpr, tpr)
    np.float64(0.75)
    """
    check_consistent_length(x, y)
    x = column_or_1d(x)
    y = column_or_1d(y)

    n_points = x.shape[0]
    if n_points < 2:
        raise ValueError(
            "At least 2 points are needed to compute area under curve, but x.shape = %s"
            % x.shape
        )

    # The sign of the integral depends on the direction in which x runs, so a
    # decreasing x is compensated for by flipping the sign of the result.
    steps = np.diff(x)
    if not np.any(steps < 0):
        sign = 1
    elif np.all(steps <= 0):
        sign = -1
    else:
        raise ValueError("x is neither increasing nor decreasing : {}.".format(x))

    area = sign * trapezoid(y, x)
    if isinstance(area, np.memmap):
        # The integration result can come back as a memmap instance; turn it
        # into a plain NumPy scalar of the same dtype before returning.
        area = area.dtype.type(area)
    return area
|
|
|
|
@validate_params(
    {
        "y_true": ["array-like"],
        "y_score": ["array-like"],
        "average": [StrOptions({"micro", "samples", "weighted", "macro"}), None],
        "pos_label": [Real, str, "boolean"],
        "sample_weight": ["array-like", None],
    },
    prefer_skip_nested_validation=True,
)
def average_precision_score(
    y_true, y_score, *, average="macro", pos_label=1, sample_weight=None
):
    """Compute average precision (AP) from prediction scores.

    AP condenses a precision-recall curve into a single number: the mean of
    the precisions reached at each threshold, each weighted by the gain in
    recall with respect to the previous threshold:

    .. math::
        \\text{AP} = \\sum_n (R_n - R_{n-1}) P_n

    where :math:`P_n` and :math:`R_n` are the precision and recall at the nth
    threshold [1]_. No interpolation is performed, so the result differs from
    the area under the precision-recall curve obtained with the trapezoidal
    rule, which interpolates linearly and can be too optimistic.

    Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_classes)
        True binary labels or binary label indicators.

    y_score : array-like of shape (n_samples,) or (n_samples, n_classes)
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by :term:`decision_function` on some classifiers).
        For :term:`decision_function` scores, values greater than or equal to
        zero should indicate the positive class.

    average : {'micro', 'samples', 'weighted', 'macro'} or None, \
            default='macro'
        If ``None``, the scores for each class are returned. Otherwise,
        this determines the type of averaging performed on the data:

        ``'micro'``:
            Calculate metrics globally by considering each element of the label
            indicator matrix as a label.
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean.  This does not take label imbalance into account.
        ``'weighted'``:
            Calculate metrics for each label, and find their average, weighted
            by support (the number of true instances for each label).
        ``'samples'``:
            Calculate metrics for each instance, and find their average.

        Will be ignored when ``y_true`` is binary.

    pos_label : int, float, bool or str, default=1
        The label of the positive class. Only applied to binary ``y_true``.
        For multilabel-indicator and multiclass ``y_true``, ``pos_label`` must
        be left at 1; any other value raises an error.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    average_precision : float
        Average precision score.

    Raises
    ------
    ValueError
        If ``y_true`` is binary with two labels present and ``pos_label`` is
        not one of them, or if ``pos_label`` differs from 1 for
        multilabel-indicator or multiclass ``y_true``.

    See Also
    --------
    roc_auc_score : Compute the area under the ROC curve.
    precision_recall_curve : Compute precision-recall pairs for different
        probability thresholds.

    Notes
    -----
    .. versionchanged:: 0.19
      Instead of linearly interpolating between operating points, precisions
      are weighted by the change in recall since the last operating point.

    References
    ----------
    .. [1] `Wikipedia entry for the Average precision
           <https://en.wikipedia.org/w/index.php?title=Information_retrieval&
           oldid=793358396#Average_precision>`_

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics import average_precision_score
    >>> y_true = np.array([0, 0, 1, 1])
    >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])
    >>> average_precision_score(y_true, y_scores)
    np.float64(0.83...)
    >>> y_true = np.array([0, 0, 1, 1, 2, 2])
    >>> y_scores = np.array([
    ...     [0.7, 0.2, 0.1],
    ...     [0.4, 0.3, 0.3],
    ...     [0.1, 0.8, 0.1],
    ...     [0.2, 0.3, 0.5],
    ...     [0.4, 0.4, 0.2],
    ...     [0.1, 0.2, 0.7],
    ... ])
    >>> average_precision_score(y_true, y_scores)
    np.float64(0.77...)
    """

    def _step_integral_precision(y_true, y_score, pos_label=1, sample_weight=None):
        # Binary AP: sum of precision[i] * (recall[i] - recall[i + 1]).
        precision, recall, _ = precision_recall_curve(
            y_true, y_score, pos_label=pos_label, sample_weight=sample_weight
        )
        recall_steps = np.diff(recall)
        # The final precision entry has no following recall step, so it is
        # left out of the sum. Recall is documented as decreasing, hence the
        # negation; clipping at 0.0 keeps the result from being negative
        # (presumably guarding against -0.0 / rounding noise).
        weighted_precisions = recall_steps * np.array(precision)[:-1]
        return max(0.0, -np.sum(weighted_precisions))

    target_kind = type_of_target(y_true, input_name="y_true")

    # Plain Python values, so that membership tests against `pos_label` do not
    # compare NumPy scalars with Python objects.
    present_labels = np.unique(y_true).tolist()

    if target_kind == "binary":
        if len(present_labels) == 2 and pos_label not in present_labels:
            raise ValueError(
                f"pos_label={pos_label} is not a valid label. It should be "
                f"one of {present_labels}"
            )
    elif target_kind == "multilabel-indicator":
        if pos_label != 1:
            raise ValueError(
                "Parameter pos_label is fixed to 1 for multilabel-indicator y_true. "
                "Do not set pos_label or set pos_label to 1."
            )
    elif target_kind == "multiclass":
        if pos_label != 1:
            raise ValueError(
                "Parameter pos_label is fixed to 1 for multiclass y_true. "
                "Do not set pos_label or set pos_label to 1."
            )
        # Multiclass targets are scored as one indicator column per class.
        y_true = label_binarize(y_true, classes=present_labels)

    binary_metric = partial(_step_integral_precision, pos_label=pos_label)
    return _average_binary_score(
        binary_metric, y_true, y_score, average, sample_weight=sample_weight
    )
|
|
|
|
@validate_params(
    {
        "y_true": ["array-like"],
        "y_score": ["array-like"],
        "pos_label": [Real, str, "boolean", None],
        "sample_weight": ["array-like", None],
    },
    prefer_skip_nested_validation=True,
)
def det_curve(y_true, y_score, pos_label=None, sample_weight=None):
    """Compute error rates for different probability thresholds.

    .. note::
       This metric is used for evaluation of ranking and error tradeoffs of
       a binary classification task.

    Read more in the :ref:`User Guide <det_curve>`.

    .. versionadded:: 0.24

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True binary labels. If labels are not either {-1, 1} or {0, 1}, then
        pos_label should be explicitly given.

    y_score : array-like of shape (n_samples,)
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers).
        For :term:`decision_function` scores, values greater than or equal to
        zero should indicate the positive class.

    pos_label : int, float, bool or str, default=None
        The label of the positive class.
        When ``pos_label=None``, if `y_true` is in {-1, 1} or {0, 1},
        ``pos_label`` is set to 1, otherwise an error will be raised.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    fpr : ndarray of shape (n_thresholds,)
        False positive rate (FPR) such that element i is the false positive
        rate of predictions with score >= thresholds[i]. This is occasionally
        referred to as false acceptance probability or fall-out.

    fnr : ndarray of shape (n_thresholds,)
        False negative rate (FNR) such that element i is the false negative
        rate of predictions with score >= thresholds[i]. This is occasionally
        referred to as false rejection or miss rate.

    thresholds : ndarray of shape (n_thresholds,)
        Increasing score values. The thresholds are returned in the reverse
        of the (decreasing) order used internally, so that they line up with
        `fpr` and `fnr`.

    Raises
    ------
    ValueError
        If `y_true` does not contain exactly two distinct values.

    See Also
    --------
    DetCurveDisplay.from_estimator : Plot DET curve given an estimator and
        some data.
    DetCurveDisplay.from_predictions : Plot DET curve given the true and
        predicted labels.
    DetCurveDisplay : DET curve visualization.
    roc_curve : Compute Receiver operating characteristic (ROC) curve.
    precision_recall_curve : Compute precision-recall curve.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics import det_curve
    >>> y_true = np.array([0, 0, 1, 1])
    >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])
    >>> fpr, fnr, thresholds = det_curve(y_true, y_scores)
    >>> fpr
    array([0.5, 0.5, 0. ])
    >>> fnr
    array([0. , 0.5, 0.5])
    >>> thresholds
    array([0.35, 0.4 , 0.8 ])
    """
    fps, tps, thresholds = _binary_clf_curve(
        y_true, y_score, pos_label=pos_label, sample_weight=sample_weight
    )

    if len(np.unique(y_true)) != 2:
        raise ValueError(
            "Only one class is present in y_true. Detection error "
            "tradeoff curve is not defined in that case."
        )

    # The last cumulative counts are the class totals, so false negatives at a
    # threshold are the positives not yet counted as true positives.
    p_count = tps[-1]
    n_count = fps[-1]
    fns = p_count - tps

    # Among the leading thresholds that share the initial false-positive
    # count, keep only the last one. The lookup is done once and reused.
    first_ind_right = fps.searchsorted(fps[0], side="right")
    first_ind = first_ind_right - 1 if first_ind_right > 0 else None
    # Stop right after the first threshold at which every positive has been
    # found (false negatives reach zero).
    last_ind = tps.searchsorted(tps[-1]) + 1
    sl = slice(first_ind, last_ind)

    # Reverse the outputs: thresholds come back in increasing order and the
    # false positive rate in decreasing order.
    return (fps[sl][::-1] / n_count, fns[sl][::-1] / p_count, thresholds[sl][::-1])
|
|
|
|
def _binary_roc_auc_score(y_true, y_score, sample_weight=None, max_fpr=None):
    """Compute the ROC AUC of a single binary problem.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True binary labels.

    y_score : array-like of shape (n_samples,)
        Target scores.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights, forwarded to `roc_curve`.

    max_fpr : float, default=None
        When ``None`` or equal to 1 the full area under the ROC curve is
        returned. Otherwise the curve is truncated at ``max_fpr`` and the
        standardized partial area is returned.

    Returns
    -------
    score : float
        The (possibly partial and standardized) ROC AUC, or ``np.nan`` together
        with an `UndefinedMetricWarning` when `y_true` does not contain
        exactly two distinct values.

    Raises
    ------
    ValueError
        If `max_fpr` is not ``None`` and lies outside ``(0, 1]``.
    """
    n_distinct_labels = len(np.unique(y_true))
    if n_distinct_labels != 2:
        warnings.warn(
            (
                "Only one class is present in y_true. ROC AUC score "
                "is not defined in that case."
            ),
            UndefinedMetricWarning,
        )
        return np.nan

    fpr, tpr, _ = roc_curve(y_true, y_score, sample_weight=sample_weight)

    full_curve_requested = max_fpr is None or max_fpr == 1
    if full_curve_requested:
        return auc(fpr, tpr)
    if max_fpr <= 0 or max_fpr > 1:
        raise ValueError("Expected max_fpr in range (0, 1], got: %r" % max_fpr)

    # Truncate the curve at max_fpr, closing it with one linearly interpolated
    # point located exactly at fpr == max_fpr.
    stop = np.searchsorted(fpr, max_fpr, "right")
    bracket_fpr = [fpr[stop - 1], fpr[stop]]
    bracket_tpr = [tpr[stop - 1], tpr[stop]]
    tpr_at_max_fpr = np.interp(max_fpr, bracket_fpr, bracket_tpr)
    tpr = np.append(tpr[:stop], tpr_at_max_fpr)
    fpr = np.append(fpr[:stop], max_fpr)
    partial_auc = auc(fpr, tpr)

    # Standardize the partial area: `min_area` is the area under the diagonal
    # up to max_fpr and `max_area` that of a perfect curve, so the result is
    # 0.5 when partial_auc == min_area and 1 when partial_auc == max_area.
    min_area = 0.5 * max_fpr**2
    max_area = max_fpr
    return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area))
|
|
|
|
@validate_params(
    {
        "y_true": ["array-like"],
        "y_score": ["array-like"],
        "average": [StrOptions({"micro", "macro", "samples", "weighted"}), None],
        "sample_weight": ["array-like", None],
        "max_fpr": [Interval(Real, 0.0, 1, closed="right"), None],
        "multi_class": [StrOptions({"raise", "ovr", "ovo"})],
        "labels": ["array-like", None],
    },
    prefer_skip_nested_validation=True,
)
def roc_auc_score(
    y_true,
    y_score,
    *,
    average="macro",
    sample_weight=None,
    max_fpr=None,
    multi_class="raise",
    labels=None,
):
    """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) \
    from prediction scores.

    Note: this implementation can be used with binary, multiclass and
    multilabel classification, but some restrictions apply (see Parameters).

    Read more in the :ref:`User Guide <roc_metrics>`.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_classes)
        True labels or binary label indicators. The binary and multiclass cases
        expect labels with shape (n_samples,) while the multilabel case expects
        binary label indicators with shape (n_samples, n_classes).

    y_score : array-like of shape (n_samples,) or (n_samples, n_classes)
        Target scores.

        * In the binary case, it corresponds to an array of shape
          `(n_samples,)`. Both probability estimates and non-thresholded
          decision values can be provided. The probability estimates correspond
          to the **probability of the class with the greater label**,
          i.e. `estimator.classes_[1]` and thus
          `estimator.predict_proba(X)[:, 1]`. The decision values
          correspond to the output of `estimator.decision_function(X)`.
          See more information in the :ref:`User guide <roc_auc_binary>`;
        * In the multiclass case, it corresponds to an array of shape
          `(n_samples, n_classes)` of probability estimates provided by the
          `predict_proba` method. The probability estimates **must**
          sum to 1 across the possible classes. In addition, the order of the
          class scores must correspond to the order of ``labels``,
          if provided, or else to the numerical or lexicographical order of
          the labels in ``y_true``. See more information in the
          :ref:`User guide <roc_auc_multiclass>`;
        * In the multilabel case, it corresponds to an array of shape
          `(n_samples, n_classes)`. Probability estimates are provided by the
          `predict_proba` method and the non-thresholded decision values by
          the `decision_function` method. The probability estimates correspond
          to the **probability of the class with the greater label for each
          output** of the classifier. See more information in the
          :ref:`User guide <roc_auc_multilabel>`.

    average : {'micro', 'macro', 'samples', 'weighted'} or None, \
            default='macro'
        If ``None``, the scores for each class are returned.
        Otherwise, this determines the type of averaging performed on the data.
        Note: multiclass ROC AUC currently only handles the 'macro' and
        'weighted' averages. For multiclass targets, `average=None` is only
        implemented for `multi_class='ovr'` and `average='micro'` is only
        implemented for `multi_class='ovr'`.

        ``'micro'``:
            Calculate metrics globally by considering each element of the label
            indicator matrix as a label.
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean.  This does not take label imbalance into account.
        ``'weighted'``:
            Calculate metrics for each label, and find their average, weighted
            by support (the number of true instances for each label).
        ``'samples'``:
            Calculate metrics for each instance, and find their average.

        Will be ignored when ``y_true`` is binary.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    max_fpr : float > 0 and <= 1, default=None
        If not ``None``, the standardized partial AUC [2]_ over the range
        [0, max_fpr] is returned. For the multiclass case, ``max_fpr``,
        should be either equal to ``None`` or ``1.0`` as AUC ROC partial
        computation currently is not supported for multiclass.

    multi_class : {'raise', 'ovr', 'ovo'}, default='raise'
        Only used for multiclass targets. Determines the type of configuration
        to use. The default value raises an error, so either
        ``'ovr'`` or ``'ovo'`` must be passed explicitly.

        ``'ovr'``:
            Stands for One-vs-rest. Computes the AUC of each class
            against the rest [3]_ [4]_. This
            treats the multiclass case in the same way as the multilabel case.
            Sensitive to class imbalance even when ``average == 'macro'``,
            because class imbalance affects the composition of each of the
            'rest' groupings.
        ``'ovo'``:
            Stands for One-vs-one. Computes the average AUC of all
            possible pairwise combinations of classes [5]_.
            Insensitive to class imbalance when
            ``average == 'macro'``.

    labels : array-like of shape (n_classes,), default=None
        Only used for multiclass targets. List of labels that index the
        classes in ``y_score``. If ``None``, the numerical or lexicographical
        order of the labels in ``y_true`` is used.

    Returns
    -------
    auc : float
        Area Under the Curve score. As shown in the examples below, an array
        with one score per class is returned when ``average=None``.

    See Also
    --------
    average_precision_score : Area under the precision-recall curve.
    roc_curve : Compute Receiver operating characteristic (ROC) curve.
    RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic
        (ROC) curve given an estimator and some data.
    RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic
        (ROC) curve given the true and predicted values.

    Notes
    -----
    The Gini Coefficient is a summary measure of the ranking ability of binary
    classifiers. It is expressed using the area under the ROC curve as follows:

    G = 2 * AUC - 1

    Where G is the Gini coefficient and AUC is the ROC-AUC score. This
    normalisation will ensure that random guessing will yield a score of 0 in
    expectation, and it is upper bounded by 1.

    References
    ----------
    .. [1] `Wikipedia entry for the Receiver operating characteristic
            <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_

    .. [2] `Analyzing a portion of the ROC curve. McClish, 1989
            <https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_

    .. [3] Provost, F., Domingos, P. (2000). Well-trained PETs: Improving
           probability estimation trees (Section 6.2), CeDER Working Paper
           #IS-00-04, Stern School of Business, New York University.

    .. [4] `Fawcett, T. (2006). An introduction to ROC analysis. Pattern
            Recognition Letters, 27(8), 861-874.
            <https://www.sciencedirect.com/science/article/pii/S016786550500303X>`_

    .. [5] `Hand, D.J., Till, R.J. (2001). A Simple Generalisation of the Area
            Under the ROC Curve for Multiple Class Classification Problems.
            Machine Learning, 45(2), 171-186.
            <http://link.springer.com/article/10.1023/A:1010920819831>`_
    .. [6] `Wikipedia entry for the Gini coefficient
            <https://en.wikipedia.org/wiki/Gini_coefficient>`_

    Examples
    --------
    Binary case:

    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.metrics import roc_auc_score
    >>> X, y = load_breast_cancer(return_X_y=True)
    >>> clf = LogisticRegression(solver="liblinear", random_state=0).fit(X, y)
    >>> roc_auc_score(y, clf.predict_proba(X)[:, 1])
    np.float64(0.99...)
    >>> roc_auc_score(y, clf.decision_function(X))
    np.float64(0.99...)

    Multiclass case:

    >>> from sklearn.datasets import load_iris
    >>> X, y = load_iris(return_X_y=True)
    >>> clf = LogisticRegression(solver="liblinear").fit(X, y)
    >>> roc_auc_score(y, clf.predict_proba(X), multi_class='ovr')
    np.float64(0.99...)

    Multilabel case:

    >>> import numpy as np
    >>> from sklearn.datasets import make_multilabel_classification
    >>> from sklearn.multioutput import MultiOutputClassifier
    >>> X, y = make_multilabel_classification(random_state=0)
    >>> clf = MultiOutputClassifier(clf).fit(X, y)
    >>> # get a list of n_output containing probability arrays of shape
    >>> # (n_samples, n_classes)
    >>> y_pred = clf.predict_proba(X)
    >>> # extract the positive columns for each output
    >>> y_pred = np.transpose([pred[:, 1] for pred in y_pred])
    >>> roc_auc_score(y, y_pred, average=None)
    array([0.82..., 0.86..., 0.94..., 0.85... , 0.94...])
    >>> from sklearn.linear_model import RidgeClassifierCV
    >>> clf = RidgeClassifierCV().fit(X, y)
    >>> roc_auc_score(y, clf.decision_function(X), average=None)
    array([0.81..., 0.84... , 0.93..., 0.87..., 0.94...])
    """
    y_type = type_of_target(y_true, input_name="y_true")
    y_true = check_array(y_true, ensure_2d=False, dtype=None)
    y_score = check_array(y_score, ensure_2d=False)

    # Binary labels paired with more than two score columns are handled by
    # the multiclass code path as well.
    binary_with_many_columns = (
        y_type == "binary" and y_score.ndim == 2 and y_score.shape[1] > 2
    )
    if y_type == "multiclass" or binary_with_many_columns:
        # Partial AUC is not supported on the multiclass path.
        if max_fpr is not None and max_fpr != 1.0:
            raise ValueError(
                "Partial AUC computation not available in "
                "multiclass setting, 'max_fpr' must be"
                " set to `None`, received `max_fpr={0}` "
                "instead".format(max_fpr)
            )
        if multi_class == "raise":
            raise ValueError("multi_class must be in ('ovo', 'ovr')")
        return _multiclass_roc_auc_score(
            y_true, y_score, labels, multi_class, average, sample_weight
        )

    if y_type == "binary":
        # Re-encode the two labels as a single indicator column.
        labels = np.unique(y_true)
        y_true = label_binarize(y_true, classes=labels)[:, 0]

    # Binary targets and every remaining target type share the same
    # averaging call.
    binary_metric = partial(_binary_roc_auc_score, max_fpr=max_fpr)
    return _average_binary_score(
        binary_metric,
        y_true,
        y_score,
        average,
        sample_weight=sample_weight,
    )
|
|
|
|
def _multiclass_roc_auc_score(
    y_true, y_score, labels, multi_class, average, sample_weight
):
    """Compute the ROC AUC score for a multiclass target.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True multiclass labels.

    y_score : array-like of shape (n_samples, n_classes)
        Target scores corresponding to probability estimates of a sample
        belonging to a particular class. Each row must sum to 1.

    labels : array-like of shape (n_classes,) or None
        List of labels to index ``y_score`` used for multiclass. It must be
        unique and ordered. If ``None``, the unique values of ``y_true`` are
        used to index ``y_score``.

    multi_class : {'ovr', 'ovo'}
        Determines the type of multiclass configuration to use.
        ``'ovr'``:
            Calculate metrics for the multiclass case using the one-vs-rest
            approach.
        ``'ovo'``:
            Calculate metrics for the multiclass case using the one-vs-one
            approach.

    average : {'micro', 'macro', 'weighted'} or None
        Determines the type of averaging performed on the pairwise binary
        metric scores. ``None`` is only accepted with `multi_class='ovr'`.
        ``'micro'``:
            Calculate metrics for the binarized-raveled classes. Only supported
            for `multi_class='ovr'`.

        .. versionadded:: 1.2

        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean. This does not take label imbalance into account. Classes
            are assumed to be uniformly distributed.
        ``'weighted'``:
            Calculate metrics for each label, taking into account the
            prevalence of the classes.

    sample_weight : array-like of shape (n_samples,) or None
        Sample weights. Must be ``None`` when `multi_class='ovo'`.

    Raises
    ------
    ValueError
        If the scores do not sum to 1 per row, if `average`, `multi_class`,
        `labels` or `sample_weight` are invalid for the requested
        configuration, or if the number of classes does not match the number
        of columns of `y_score`.
    NotImplementedError
        If `average` is ``None`` while `multi_class='ovo'`.
    """
    # The scores are required to be probabilities: every row sums to one.
    row_totals = y_score.sum(axis=1)
    if not np.allclose(1, row_totals):
        raise ValueError(
            "Target scores need to be probabilities for multiclass "
            "roc_auc, i.e. they should sum up to 1.0 over classes"
        )

    # 'micro' averaging is only offered for the one-vs-rest configuration.
    if multi_class == "ovr":
        average_options = ("micro", "macro", "weighted", None)
    else:
        average_options = ("macro", "weighted", None)
    if average not in average_options:
        raise ValueError(
            "average must be one of {0} for multiclass problems".format(average_options)
        )

    multiclass_options = ("ovo", "ovr")
    if multi_class not in multiclass_options:
        raise ValueError(
            "multi_class='{0}' is not supported "
            "for multiclass ROC AUC, multi_class must be "
            "in {1}".format(multi_class, multiclass_options)
        )

    if average is None and multi_class == "ovo":
        raise NotImplementedError(
            "average=None is not implemented for multi_class='ovo'."
        )

    # Work out the class list that indexes the columns of y_score.
    if labels is None:
        classes = _unique(y_true)
        if len(classes) != y_score.shape[1]:
            raise ValueError(
                "Number of classes in y_true not equal to the number of "
                "columns in 'y_score'"
            )
    else:
        labels = column_or_1d(labels)
        classes = _unique(labels)
        if len(classes) != len(labels):
            raise ValueError("Parameter 'labels' must be unique")
        if not np.array_equal(classes, labels):
            raise ValueError("Parameter 'labels' must be ordered")
        if len(classes) != y_score.shape[1]:
            raise ValueError(
                "Number of given labels, {0}, not equal to the number "
                "of columns in 'y_score', {1}".format(len(classes), y_score.shape[1])
            )
        if len(np.setdiff1d(y_true, classes)):
            raise ValueError("'y_true' contains labels not in parameter 'labels'")

    if multi_class == "ovo":
        if sample_weight is not None:
            raise ValueError(
                "sample_weight is not supported "
                "for multiclass one-vs-one ROC AUC, "
                "'sample_weight' must be None in this case."
            )
        # One-vs-one: encode the labels against `classes` and average the
        # binary score through the dedicated pairwise helper.
        y_true_encoded = _encode(y_true, uniques=classes)
        return _average_multiclass_ovo_score(
            _binary_roc_auc_score, y_true_encoded, y_score, average=average
        )

    # One-vs-rest: binarize the labels into one indicator column per class and
    # score them like a multilabel problem.
    y_true_multilabel = label_binarize(y_true, classes=classes)
    return _average_binary_score(
        _binary_roc_auc_score,
        y_true_multilabel,
        y_score,
        average,
        sample_weight=sample_weight,
    )
|
|
|
|
def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None):
    """Count true and false positives at every distinct score threshold.

    Parameters
    ----------
    y_true : ndarray of shape (n_samples,)
        True targets of binary classification.

    y_score : ndarray of shape (n_samples,)
        Estimated probabilities or output of a decision function.

    pos_label : int, float, bool or str, default=None
        The label of the positive class.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. Samples with a weight of exactly zero are discarded
        before the counts are computed.

    Returns
    -------
    fps : ndarray of shape (n_thresholds,)
        A count of false positives, at index i being the number of negative
        samples assigned a score >= thresholds[i]. The total number of
        negative samples is equal to fps[-1] (thus true negatives are given by
        fps[-1] - fps).

    tps : ndarray of shape (n_thresholds,)
        An increasing count of true positives, at index i being the number
        of positive samples assigned a score >= thresholds[i]. The total
        number of positive samples is equal to tps[-1] (thus false negatives
        are given by tps[-1] - tps).

    thresholds : ndarray of shape (n_thresholds,)
        Decreasing score values.

    Raises
    ------
    ValueError
        If `y_true` is neither binary, nor multiclass with an explicit
        `pos_label`.
    """
    # Only binary targets, or multiclass ones with an explicit positive
    # label, can be reduced to a binary curve.
    y_type = type_of_target(y_true, input_name="y_true")
    is_supported = y_type == "binary" or (
        y_type == "multiclass" and pos_label is not None
    )
    if not is_supported:
        raise ValueError("{0} format is not supported".format(y_type))

    check_consistent_length(y_true, y_score, sample_weight)
    y_true = column_or_1d(y_true)
    y_score = column_or_1d(y_score)
    assert_all_finite(y_true)
    assert_all_finite(y_score)

    weighted = sample_weight is not None
    if weighted:
        # Drop the zero-weight samples from all three arrays.
        sample_weight = column_or_1d(sample_weight)
        sample_weight = _check_sample_weight(sample_weight, y_true)
        keep = sample_weight != 0
        y_true, y_score, sample_weight = (
            y_true[keep],
            y_score[keep],
            sample_weight[keep],
        )

    pos_label = _check_pos_label_consistency(pos_label, y_true)

    # From here on y_true is a boolean mask marking the positive samples.
    y_true = y_true == pos_label

    # Order the samples by decreasing score.
    order = np.argsort(y_score, kind="mergesort")[::-1]
    y_score = y_score[order]
    y_true = y_true[order]
    weight = sample_weight[order] if weighted else 1.0

    # Tied scores collapse into a single threshold: keep the index of the last
    # sample of every run of equal scores, plus the very last sample so that
    # the curve always reaches its end.
    score_change_idxs = np.where(np.diff(y_score))[0]
    threshold_idxs = np.r_[score_change_idxs, y_true.size - 1]

    # Cumulative (weighted) number of positives down to each threshold.
    tps = stable_cumsum(y_true * weight)[threshold_idxs]
    if weighted:
        # With weights, accumulate the negatives separately rather than
        # deriving them from tps (presumably to keep fps increasing despite
        # floating point rounding).
        fps = stable_cumsum((1 - y_true) * weight)[threshold_idxs]
    else:
        # Without weights, 1 + idx samples have a score >= the threshold;
        # those that are not true positives are false positives.
        fps = 1 + threshold_idxs - tps
    return fps, tps, y_score[threshold_idxs]
|
|
|
|
| @validate_params( |
| { |
| "y_true": ["array-like"], |
| "y_score": ["array-like", Hidden(None)], |
| "pos_label": [Real, str, "boolean", None], |
| "sample_weight": ["array-like", None], |
| "drop_intermediate": ["boolean"], |
| "probas_pred": [ |
| "array-like", |
| Hidden(StrOptions({"deprecated"})), |
| ], |
| }, |
| prefer_skip_nested_validation=True, |
| ) |
def precision_recall_curve(
    y_true,
    y_score=None,
    *,
    pos_label=None,
    sample_weight=None,
    drop_intermediate=False,
    probas_pred="deprecated",
):
    """Compute precision-recall pairs for different probability thresholds.

    Note: this implementation is restricted to the binary classification task.

    The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of
    true positives and ``fp`` the number of false positives. The precision is
    intuitively the ability of the classifier not to label as positive a sample
    that is negative.

    The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of
    true positives and ``fn`` the number of false negatives. The recall is
    intuitively the ability of the classifier to find all the positive samples.

    The last precision and recall values are 1. and 0. respectively and do not
    have a corresponding threshold. This ensures that the graph starts on the
    y axis.

    The first precision and recall values are precision=class balance and recall=1.0
    which corresponds to a classifier that always predicts the positive class.

    Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True binary labels. If labels are not either {-1, 1} or {0, 1}, then
        pos_label should be explicitly given.

    y_score : array-like of shape (n_samples,)
        Target scores, can either be probability estimates of the positive
        class, or non-thresholded measure of decisions (as returned by
        `decision_function` on some classifiers).
        For :term:`decision_function` scores, values greater than or equal to
        zero should indicate the positive class.

    pos_label : int, float, bool or str, default=None
        The label of the positive class.
        When ``pos_label=None``, if y_true is in {-1, 1} or {0, 1},
        ``pos_label`` is set to 1, otherwise an error will be raised.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    drop_intermediate : bool, default=False
        Whether to drop some suboptimal thresholds which would not appear
        on a plotted precision-recall curve. This is useful in order to create
        lighter precision-recall curves.

        .. versionadded:: 1.3

    probas_pred : array-like of shape (n_samples,)
        Target scores, can either be probability estimates of the positive
        class, or non-thresholded measure of decisions (as returned by
        `decision_function` on some classifiers).

        .. deprecated:: 1.5
            `probas_pred` is deprecated and will be removed in 1.7. Use
            `y_score` instead.

    Returns
    -------
    precision : ndarray of shape (n_thresholds + 1,)
        Precision values such that element i is the precision of
        predictions with score >= thresholds[i] and the last element is 1.

    recall : ndarray of shape (n_thresholds + 1,)
        Decreasing recall values such that element i is the recall of
        predictions with score >= thresholds[i] and the last element is 0.

    thresholds : ndarray of shape (n_thresholds,)
        Increasing thresholds on the decision function used to compute
        precision and recall where `n_thresholds = len(np.unique(y_score))`.

    Raises
    ------
    ValueError
        If both `y_score` and `probas_pred` are given, or if neither of them
        is given.

    See Also
    --------
    PrecisionRecallDisplay.from_estimator : Plot Precision Recall Curve given
        a binary classifier.
    PrecisionRecallDisplay.from_predictions : Plot Precision Recall Curve
        using predictions from a binary classifier.
    average_precision_score : Compute average precision from prediction scores.
    det_curve: Compute error rates for different probability thresholds.
    roc_curve : Compute Receiver operating characteristic (ROC) curve.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics import precision_recall_curve
    >>> y_true = np.array([0, 0, 1, 1])
    >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])
    >>> precision, recall, thresholds = precision_recall_curve(
    ...     y_true, y_scores)
    >>> precision
    array([0.5       , 0.66666667, 0.5       , 1.        , 1.        ])
    >>> recall
    array([1. , 1. , 0.5, 0.5, 0. ])
    >>> thresholds
    array([0.1 , 0.35, 0.4 , 0.8 ])
    """
    # TODO(1.7): remove `probas_pred` and make `y_score` a required argument.
    # `probas_pred` defaults to the string sentinel "deprecated"; any non-string
    # value therefore means the caller passed it explicitly.
    probas_pred_given = not isinstance(probas_pred, str)
    if y_score is not None and probas_pred_given:
        raise ValueError(
            "`probas_pred` and `y_score` cannot be both specified. Please use `y_score`"
            " only as `probas_pred` is deprecated in v1.5 and will be removed in v1.7."
        )
    if y_score is None:
        if not probas_pred_given:
            # Neither argument was supplied: fail early with a clear message
            # instead of warning about a deprecation the caller did not trigger
            # and forwarding the sentinel string as scores.
            raise ValueError(
                "`y_score` must be provided: pass the target scores as `y_score` "
                "(`probas_pred` is deprecated in v1.5 and will be removed in v1.7)."
            )
        warnings.warn(
            (
                "probas_pred was deprecated in version 1.5 and will be removed in 1.7. "
                "Please use ``y_score`` instead."
            ),
            FutureWarning,
        )
        y_score = probas_pred

    fps, tps, thresholds = _binary_clf_curve(
        y_true, y_score, pos_label=pos_label, sample_weight=sample_weight
    )

    if drop_intermediate and len(fps) > 2:
        # Keep the first and last points, plus every interior point whose tps
        # differs from the previous or the next point. Points sharing a tps
        # value share the same recall (recall is tps / tps[-1]), so the dropped
        # points lie strictly inside a run of constant recall.
        optimal_idxs = np.where(
            np.concatenate(
                [[True], np.logical_or(np.diff(tps[:-1]), np.diff(tps[1:])), [True]]
            )
        )[0]
        fps = fps[optimal_idxs]
        tps = tps[optimal_idxs]
        thresholds = thresholds[optimal_idxs]

    ps = tps + fps
    # Start from zeros so that entries where ps == 0 (skipped by `where=`) hold
    # a defined value instead of uninitialized memory.
    precision = np.zeros_like(tps)
    np.divide(tps, ps, out=precision, where=(ps != 0))

    # tps[-1] == 0 means no sample counted as a true positive at any threshold;
    # recall is then defined as 1 for every threshold.
    if tps[-1] == 0:
        warnings.warn(
            "No positive class found in y_true, "
            "recall is set to one for all thresholds."
        )
        recall = np.ones_like(tps)
    else:
        recall = tps / tps[-1]

    # Reverse the outputs and append the final (precision=1, recall=0) point,
    # which has no associated threshold.
    sl = slice(None, None, -1)
    return np.hstack((precision[sl], 1)), np.hstack((recall[sl], 0)), thresholds[sl]
|
|
|
|
@validate_params(
    {
        "y_true": ["array-like"],
        "y_score": ["array-like"],
        "pos_label": [Real, str, "boolean", None],
        "sample_weight": ["array-like", None],
        "drop_intermediate": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def roc_curve(
    y_true, y_score, *, pos_label=None, sample_weight=None, drop_intermediate=True
):
    """Compute Receiver operating characteristic (ROC).

    Note: this implementation is restricted to the binary classification task.

    Read more in the :ref:`User Guide <roc_metrics>`.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True binary labels. If labels are not either {-1, 1} or {0, 1}, then
        pos_label should be explicitly given.

    y_score : array-like of shape (n_samples,)
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers).
        For :term:`decision_function` scores, values greater than or equal to
        zero should indicate the positive class.

    pos_label : int, float, bool or str, default=None
        The label of the positive class.
        When ``pos_label=None``, if `y_true` is in {-1, 1} or {0, 1},
        ``pos_label`` is set to 1, otherwise an error will be raised.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    drop_intermediate : bool, default=True
        Whether to drop some suboptimal thresholds which would not appear
        on a plotted ROC curve. This is useful in order to create lighter
        ROC curves.

        .. versionadded:: 0.17
           parameter *drop_intermediate*.

    Returns
    -------
    fpr : ndarray of shape (>2,)
        Increasing false positive rates such that element i is the false
        positive rate of predictions with score >= `thresholds[i]`.

    tpr : ndarray of shape (>2,)
        Increasing true positive rates such that element `i` is the true
        positive rate of predictions with score >= `thresholds[i]`.

    thresholds : ndarray of shape (n_thresholds,)
        Decreasing thresholds on the decision function used to compute
        fpr and tpr. `thresholds[0]` represents no instances being predicted
        and is arbitrarily set to `np.inf`.

    See Also
    --------
    RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic
        (ROC) curve given an estimator and some data.
    RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic
        (ROC) curve given the true and predicted values.
    det_curve: Compute error rates for different probability thresholds.
    roc_auc_score : Compute the area under the ROC curve.

    Notes
    -----
    ``fpr``, ``tpr`` and ``thresholds`` are aligned element-wise: entry ``i``
    of each array describes the same operating point.

    An arbitrary threshold is added for the case `tpr=0` and `fpr=0` to
    ensure that the curve starts at `(0, 0)`. This threshold corresponds to the
    `np.inf`.

    When `y_true` contains no negative (resp. positive) samples, an
    :class:`~sklearn.exceptions.UndefinedMetricWarning` is issued and ``fpr``
    (resp. ``tpr``) is filled with `np.nan`.

    References
    ----------
    .. [1] `Wikipedia entry for the Receiver operating characteristic
            <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_

    .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
           Letters, 2006, 27(8):861-874.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn import metrics
    >>> y = np.array([1, 1, 2, 2])
    >>> scores = np.array([0.1, 0.4, 0.35, 0.8])
    >>> fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2)
    >>> fpr
    array([0. , 0. , 0.5, 0.5, 1. ])
    >>> tpr
    array([0. , 0.5, 0.5, 1. , 1. ])
    >>> thresholds
    array([ inf, 0.8 , 0.4 , 0.35, 0.1 ])
    """
    false_pos, true_pos, thresholds = _binary_clf_curve(
        y_true, y_score, pos_label=pos_label, sample_weight=sample_weight
    )

    if drop_intermediate and len(false_pos) > 2:
        # An interior point is kept only when the second-order difference of
        # either count is non-zero there, i.e. the curve bends at that point.
        # Points with a zero second difference in both counts are collinear
        # with their neighbours and are dropped. End points are always kept.
        bends = np.logical_or(np.diff(false_pos, 2), np.diff(true_pos, 2))
        kept = np.flatnonzero(np.r_[True, bends, True])
        false_pos = false_pos[kept]
        true_pos = true_pos[kept]
        thresholds = thresholds[kept]

    # Prepend the origin so the curve starts at (0, 0); its threshold is +inf,
    # meaning no sample is predicted positive.
    true_pos = np.r_[0, true_pos]
    false_pos = np.r_[0, false_pos]
    thresholds = np.r_[np.inf, thresholds]

    # The last cumulative count is the total used to normalise into a rate.
    total_false_pos = false_pos[-1]
    if total_false_pos <= 0:
        warnings.warn(
            "No negative samples in y_true, false positive value should be meaningless",
            UndefinedMetricWarning,
        )
        fpr = np.full(false_pos.shape, np.nan)
    else:
        fpr = false_pos / total_false_pos

    total_true_pos = true_pos[-1]
    if total_true_pos <= 0:
        warnings.warn(
            "No positive samples in y_true, true positive value should be meaningless",
            UndefinedMetricWarning,
        )
        tpr = np.full(true_pos.shape, np.nan)
    else:
        tpr = true_pos / total_true_pos

    return fpr, tpr, thresholds
|
|
|
|
@validate_params(
    {
        "y_true": ["array-like", "sparse matrix"],
        "y_score": ["array-like"],
        "sample_weight": ["array-like", None],
    },
    prefer_skip_nested_validation=True,
)
def label_ranking_average_precision_score(y_true, y_score, *, sample_weight=None):
    """Compute ranking-based average precision.

    Label ranking average precision (LRAP) is the average over each ground
    truth label assigned to each sample, of the ratio of true vs. total
    labels with lower score.

    This metric is used in multilabel ranking problem, where the goal
    is to give better rank to the labels associated to each sample.

    The obtained score is always strictly greater than 0 and
    the best value is 1.

    Read more in the :ref:`User Guide <label_ranking_average_precision>`.

    Parameters
    ----------
    y_true : {array-like, sparse matrix} of shape (n_samples, n_labels)
        True binary labels in binary indicator format.

    y_score : array-like of shape (n_samples, n_labels)
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers).
        For :term:`decision_function` scores, values greater than or equal to
        zero should indicate the positive class.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

        .. versionadded:: 0.20

    Returns
    -------
    score : float
        Ranking-based average precision score.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics import label_ranking_average_precision_score
    >>> y_true = np.array([[1, 0, 0], [0, 0, 1]])
    >>> y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]])
    >>> label_ranking_average_precision_score(y_true, y_score)
    np.float64(0.416...)
    """
    check_consistent_length(y_true, y_score, sample_weight)
    y_true = check_array(y_true, ensure_2d=False, accept_sparse="csr")
    y_score = check_array(y_score, ensure_2d=False)

    if y_true.shape != y_score.shape:
        raise ValueError("y_true and y_score have different shape")

    # Accept multilabel indicators, plus 2-D targets that are reported as
    # "binary" by `type_of_target`.
    y_type = type_of_target(y_true, input_name="y_true")
    is_supported = y_type == "multilabel-indicator" or (
        y_type == "binary" and y_true.ndim == 2
    )
    if not is_supported:
        raise ValueError("{0} format is not supported".format(y_type))

    if not issparse(y_true):
        y_true = csr_matrix(y_true)

    # Negate so that `rankdata` (ascending) ranks the highest score first.
    y_score = -y_score

    n_samples, n_labels = y_true.shape
    row_bounds = y_true.indptr

    total = 0.0
    for row in range(n_samples):
        # Column indices of the labels stored as non-zero for this sample.
        relevant = y_true.indices[row_bounds[row] : row_bounds[row + 1]]

        if relevant.size in (0, n_labels):
            # With no relevant label, or with every label relevant, the
            # ranking carries no information: score the sample as 1.
            sample_score = 1.0
        else:
            row_scores = y_score[row]
            # Rank of each relevant label among all labels ...
            rank_overall = rankdata(row_scores, "max")[relevant]
            # ... and among the relevant labels only.
            rank_in_relevant = rankdata(row_scores[relevant], "max")
            sample_score = (rank_in_relevant / rank_overall).mean()

        if sample_weight is not None:
            sample_score = sample_score * sample_weight[row]
        total += sample_score

    if sample_weight is None:
        total /= n_samples
    else:
        total /= np.sum(sample_weight)

    return total
|
|
|
|
@validate_params(
    {
        "y_true": ["array-like"],
        "y_score": ["array-like"],
        "sample_weight": ["array-like", None],
    },
    prefer_skip_nested_validation=True,
)
def coverage_error(y_true, y_score, *, sample_weight=None):
    """Coverage error measure.

    Compute how far we need to go through the ranked scores to cover all
    true labels. The best value is equal to the average number
    of labels in ``y_true`` per sample.

    Ties in ``y_scores`` are broken by giving maximal rank that would have
    been assigned to all tied values.

    Note: Our implementation's score is 1 greater than the one given in
    Tsoumakas et al., 2010. This extends it to handle the degenerate case
    in which an instance has 0 true labels.

    Read more in the :ref:`User Guide <coverage_error>`.

    Parameters
    ----------
    y_true : array-like of shape (n_samples, n_labels)
        True binary labels in binary indicator format.

    y_score : array-like of shape (n_samples, n_labels)
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers).
        For :term:`decision_function` scores, values greater than or equal to
        zero should indicate the positive class.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    coverage_error : float
        The coverage error.

    References
    ----------
    .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010).
           Mining multi-label data. In Data mining and knowledge discovery
           handbook (pp. 667-685). Springer US.

    Examples
    --------
    >>> from sklearn.metrics import coverage_error
    >>> y_true = [[1, 0, 0], [0, 1, 1]]
    >>> y_score = [[1, 0, 0], [0, 1, 1]]
    >>> coverage_error(y_true, y_score)
    np.float64(1.5)
    """
    y_true = check_array(y_true, ensure_2d=True)
    y_score = check_array(y_score, ensure_2d=True)
    check_consistent_length(y_true, y_score, sample_weight)

    target_type = type_of_target(y_true, input_name="y_true")
    if target_type != "multilabel-indicator":
        raise ValueError("{0} format is not supported".format(target_type))

    if y_true.shape != y_score.shape:
        raise ValueError("y_true and y_score have different shape")

    # Hide the scores of labels that are zero in y_true, so the row-wise
    # minimum is the lowest score among the true labels only.
    relevant_scores = np.ma.masked_array(y_score, mask=np.logical_not(y_true))
    lowest_relevant = relevant_scores.min(axis=1).reshape((-1, 1))

    # Count the labels scoring at least as high as that lowest true label.
    # Rows with no true label are fully masked and are filled with 0.
    n_covered = (y_score >= lowest_relevant).sum(axis=1).filled(0)

    return np.average(n_covered, weights=sample_weight)
|
|
|
|
@validate_params(
    {
        "y_true": ["array-like", "sparse matrix"],
        "y_score": ["array-like"],
        "sample_weight": ["array-like", None],
    },
    prefer_skip_nested_validation=True,
)
def label_ranking_loss(y_true, y_score, *, sample_weight=None):
    """Compute Ranking loss measure.

    Compute the average number of label pairs that are incorrectly ordered
    given y_score weighted by the size of the label set and the number of
    labels not in the label set.

    This is similar to the error set size, but weighted by the number of
    relevant and irrelevant labels. The best performance is achieved with
    a ranking loss of zero.

    Read more in the :ref:`User Guide <label_ranking_loss>`.

    .. versionadded:: 0.17
       A function *label_ranking_loss*

    Parameters
    ----------
    y_true : {array-like, sparse matrix} of shape (n_samples, n_labels)
        True binary labels in binary indicator format.

    y_score : array-like of shape (n_samples, n_labels)
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers).
        For :term:`decision_function` scores, values greater than or equal to
        zero should indicate the positive class.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    loss : float
        Average number of label pairs that are incorrectly ordered given
        y_score weighted by the size of the label set and the number of labels not
        in the label set.

    References
    ----------
    .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010).
           Mining multi-label data. In Data mining and knowledge discovery
           handbook (pp. 667-685). Springer US.

    Examples
    --------
    >>> from sklearn.metrics import label_ranking_loss
    >>> y_true = [[1, 0, 0], [0, 0, 1]]
    >>> y_score = [[0.75, 0.5, 1], [1, 0.2, 0.1]]
    >>> label_ranking_loss(y_true, y_score)
    np.float64(0.75...)
    """
    y_true = check_array(y_true, ensure_2d=False, accept_sparse="csr")
    y_score = check_array(y_score, ensure_2d=False)
    check_consistent_length(y_true, y_score, sample_weight)

    target_type = type_of_target(y_true, input_name="y_true")
    if target_type != "multilabel-indicator":
        raise ValueError("{0} format is not supported".format(target_type))

    if y_true.shape != y_score.shape:
        raise ValueError("y_true and y_score have different shape")

    n_samples, n_labels = y_true.shape

    y_true = csr_matrix(y_true)
    row_bounds = y_true.indptr

    loss = np.zeros(n_samples)
    for row in range(n_samples):
        true_labels = y_true.indices[row_bounds[row] : row_bounds[row + 1]]

        # Bin the labels of this sample by distinct score value; `score_bin`
        # maps each label to the index of its score in ascending order.
        distinct_scores, score_bin = np.unique(y_score[row], return_inverse=True)
        n_bins = len(distinct_scores)
        true_per_bin = np.bincount(score_bin[true_labels], minlength=n_bins)
        all_per_bin = np.bincount(score_bin, minlength=n_bins)
        false_per_bin = all_per_bin - true_per_bin

        # For each score bin, the running count of true labels scored no higher
        # than that bin is multiplied by the number of false labels in the bin.
        # The sum counts (true, false) pairs where the false label scores at
        # least as high as the true one.
        loss[row] = np.dot(np.cumsum(true_per_bin), false_per_bin)

    n_positives = count_nonzero(y_true, axis=1)
    # The denominator is zero for rows with no positive or no negative labels;
    # silence the resulting warnings, those rows are overwritten just below.
    with np.errstate(divide="ignore", invalid="ignore"):
        loss /= (n_labels - n_positives) * n_positives

    # Rows where every label is positive or every label is negative have no
    # pair to misorder: their loss is 0.
    degenerate = np.logical_or(n_positives == 0, n_positives == n_labels)
    loss[degenerate] = 0.0

    return np.average(loss, weights=sample_weight)
|
|
|
|
| def _dcg_sample_scores(y_true, y_score, k=None, log_base=2, ignore_ties=False): |
| """Compute Discounted Cumulative Gain. |
| |
| Sum the true scores ranked in the order induced by the predicted scores, |
| after applying a logarithmic discount. |
| |
| This ranking metric yields a high value if true labels are ranked high by |
| ``y_score``. |
| |
| Parameters |
| ---------- |
| y_true : ndarray of shape (n_samples, n_labels) |
| True targets of multilabel classification, or true scores of entities |
| to be ranked. |
| |
| y_score : ndarray of shape (n_samples, n_labels) |
| Target scores, can either be probability estimates, confidence values, |
| or non-thresholded measure of decisions (as returned by |
| "decision_function" on some classifiers). |
| |
| k : int, default=None |
| Only consider the highest k scores in the ranking. If `None`, use all |
| outputs. |
| |
| log_base : float, default=2 |
| Base of the logarithm used for the discount. A low value means a |
| sharper discount (top results are more important). |
| |
| ignore_ties : bool, default=False |
| Assume that there are no ties in y_score (which is likely to be the |
| case if y_score is continuous) for efficiency gains. |
| |
| Returns |
| ------- |
| discounted_cumulative_gain : ndarray of shape (n_samples,) |
| The DCG score for each sample. |
| |
| See Also |
| -------- |
| ndcg_score : The Discounted Cumulative Gain divided by the Ideal Discounted |
| Cumulative Gain (the DCG obtained for a perfect ranking), in order to |
| have a score between 0 and 1. |
| """ |
| discount = 1 / (np.log(np.arange(y_true.shape[1]) + 2) / np.log(log_base)) |
| if k is not None: |
| discount[k:] = 0 |
| if ignore_ties: |
| ranking = np.argsort(y_score)[:, ::-1] |
| ranked = y_true[np.arange(ranking.shape[0])[:, np.newaxis], ranking] |
| cumulative_gains = discount.dot(ranked.T) |
| else: |
| discount_cumsum = np.cumsum(discount) |
| cumulative_gains = [ |
| _tie_averaged_dcg(y_t, y_s, discount_cumsum) |
| for y_t, y_s in zip(y_true, y_score) |
| ] |
| cumulative_gains = np.asarray(cumulative_gains) |
| return cumulative_gains |
|
|
|
|
| def _tie_averaged_dcg(y_true, y_score, discount_cumsum): |
| """ |
| Compute DCG by averaging over possible permutations of ties. |
| |
| The gain (`y_true`) of an index falling inside a tied group (in the order |
| induced by `y_score`) is replaced by the average gain within this group. |
| The discounted gain for a tied group is then the average `y_true` within |
| this group times the sum of discounts of the corresponding ranks. |
| |
| This amounts to averaging scores for all possible orderings of the tied |
| groups. |
| |
| (note in the case of dcg@k the discount is 0 after index k) |
| |
| Parameters |
| ---------- |
| y_true : ndarray |
| The true relevance scores. |
| |
| y_score : ndarray |
| Predicted scores. |
| |
| discount_cumsum : ndarray |
| Precomputed cumulative sum of the discounts. |
| |
| Returns |
| ------- |
| discounted_cumulative_gain : float |
| The discounted cumulative gain. |
| |
| References |
| ---------- |
| McSherry, F., & Najork, M. (2008, March). Computing information retrieval |
| performance measures efficiently in the presence of tied scores. In |
| European conference on information retrieval (pp. 414-421). Springer, |
| Berlin, Heidelberg. |
| """ |
| _, inv, counts = np.unique(-y_score, return_inverse=True, return_counts=True) |
| ranked = np.zeros(len(counts)) |
| np.add.at(ranked, inv, y_true) |
| ranked /= counts |
| groups = np.cumsum(counts) - 1 |
| discount_sums = np.empty(len(counts)) |
| discount_sums[0] = discount_cumsum[groups[0]] |
| discount_sums[1:] = np.diff(discount_cumsum[groups]) |
| return (ranked * discount_sums).sum() |
|
|
|
|
def _check_dcg_target_type(y_true):
    """Raise a ValueError unless `y_true` has a target type usable for (N)DCG.

    Parameters
    ----------
    y_true : array-like
        Targets whose type is determined with `type_of_target`.

    Raises
    ------
    ValueError
        If the detected type is not one of "multilabel-indicator",
        "continuous-multioutput" or "multiclass-multioutput".
    """
    supported_fmt = (
        "multilabel-indicator",
        "continuous-multioutput",
        "multiclass-multioutput",
    )
    detected = type_of_target(y_true, input_name="y_true")
    if detected in supported_fmt:
        return
    raise ValueError(
        "Only {} formats are supported. Got {} instead".format(supported_fmt, detected)
    )
|
|
|
|
@validate_params(
    {
        "y_true": ["array-like"],
        "y_score": ["array-like"],
        "k": [Interval(Integral, 1, None, closed="left"), None],
        "log_base": [Interval(Real, 0.0, None, closed="neither")],
        "sample_weight": ["array-like", None],
        "ignore_ties": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def dcg_score(
    y_true, y_score, *, k=None, log_base=2, sample_weight=None, ignore_ties=False
):
    """Compute Discounted Cumulative Gain.

    Sum the true scores ranked in the order induced by the predicted scores,
    after applying a logarithmic discount.

    This ranking metric yields a high value if true labels are ranked high by
    ``y_score``.

    Usually the Normalized Discounted Cumulative Gain (NDCG, computed by
    ndcg_score) is preferred.

    Parameters
    ----------
    y_true : array-like of shape (n_samples, n_labels)
        True targets of multilabel classification, or true scores of entities
        to be ranked.

    y_score : array-like of shape (n_samples, n_labels)
        Target scores, can either be probability estimates, confidence values,
        or non-thresholded measure of decisions (as returned by
        "decision_function" on some classifiers).

    k : int, default=None
        Only consider the highest k scores in the ranking. If None, use all
        outputs.

    log_base : float, default=2
        Base of the logarithm used for the discount. A low value means a
        sharper discount (top results are more important).

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. If `None`, all samples are given the same weight.

    ignore_ties : bool, default=False
        Assume that there are no ties in y_score (which is likely to be the
        case if y_score is continuous) for efficiency gains.

    Returns
    -------
    discounted_cumulative_gain : float
        The averaged sample DCG scores.

    See Also
    --------
    ndcg_score : The Discounted Cumulative Gain divided by the Ideal Discounted
        Cumulative Gain (the DCG obtained for a perfect ranking), in order to
        have a score between 0 and 1.

    References
    ----------
    `Wikipedia entry for Discounted Cumulative Gain
    <https://en.wikipedia.org/wiki/Discounted_cumulative_gain>`_.

    Jarvelin, K., & Kekalainen, J. (2002).
    Cumulated gain-based evaluation of IR techniques. ACM Transactions on
    Information Systems (TOIS), 20(4), 422-446.

    Wang, Y., Wang, L., Li, Y., He, D., Chen, W., & Liu, T. Y. (2013, May).
    A theoretical analysis of NDCG ranking measures. In Proceedings of the 26th
    Annual Conference on Learning Theory (COLT 2013).

    McSherry, F., & Najork, M. (2008, March). Computing information retrieval
    performance measures efficiently in the presence of tied scores. In
    European conference on information retrieval (pp. 414-421). Springer,
    Berlin, Heidelberg.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics import dcg_score
    >>> # we have ground-truth relevance of some answers to a query:
    >>> true_relevance = np.asarray([[10, 0, 0, 1, 5]])
    >>> # we predict scores for the answers
    >>> scores = np.asarray([[.1, .2, .3, 4, 70]])
    >>> dcg_score(true_relevance, scores)
    np.float64(9.49...)
    >>> # we can set k to truncate the sum; only top k answers contribute
    >>> dcg_score(true_relevance, scores, k=2)
    np.float64(5.63...)
    >>> # now we have some ties in our prediction
    >>> scores = np.asarray([[1, 0, 0, 0, 1]])
    >>> # by default ties are averaged, so here we get the average true
    >>> # relevance of our top predictions: (10 + 5) / 2 = 7.5
    >>> dcg_score(true_relevance, scores, k=1)
    np.float64(7.5)
    >>> # we can choose to ignore ties for faster results, but only
    >>> # if we know there aren't ties in our scores, otherwise we get
    >>> # wrong results:
    >>> dcg_score(true_relevance,
    ...           scores, k=1, ignore_ties=True)
    np.float64(5.0)
    """
    y_true = check_array(y_true, ensure_2d=False)
    y_score = check_array(y_score, ensure_2d=False)
    check_consistent_length(y_true, y_score, sample_weight)
    _check_dcg_target_type(y_true)

    # One DCG value per sample, then a (possibly weighted) mean over samples.
    per_sample_dcg = _dcg_sample_scores(
        y_true, y_score, k=k, log_base=log_base, ignore_ties=ignore_ties
    )
    return np.average(per_sample_dcg, weights=sample_weight)
|
|
|
|
def _ndcg_sample_scores(y_true, y_score, k=None, ignore_ties=False):
    """Compute the per-sample Normalized Discounted Cumulative Gain.

    The DCG of the ranking induced by ``y_score`` is divided by the ideal DCG,
    i.e. the DCG obtained when ranking by ``y_true`` itself, to obtain a score
    between 0 and 1.

    This ranking metric yields a high value if true labels are ranked high by
    ``y_score``.

    Parameters
    ----------
    y_true : ndarray of shape (n_samples, n_labels)
        True targets of multilabel classification, or true scores of entities
        to be ranked.

    y_score : ndarray of shape (n_samples, n_labels)
        Target scores, can either be probability estimates, confidence values,
        or non-thresholded measure of decisions (as returned by
        "decision_function" on some classifiers).

    k : int, default=None
        Only consider the highest k scores in the ranking. If None, use all
        outputs.

    ignore_ties : bool, default=False
        Assume that there are no ties in y_score (which is likely to be the
        case if y_score is continuous) for efficiency gains.

    Returns
    -------
    normalized_discounted_cumulative_gain : ndarray of shape (n_samples,)
        The NDCG score for each sample (float in [0., 1.]).

    See Also
    --------
    dcg_score : Discounted Cumulative Gain (not normalized).

    """
    dcg = _dcg_sample_scores(y_true, y_score, k, ignore_ties=ignore_ties)
    # The ideal ranking orders by y_true itself. Ties can be ignored there:
    # tied entries carry the same gain, so permuting them does not change the
    # re-ordered y_true.
    ideal_dcg = _dcg_sample_scores(y_true, y_true, k, ignore_ties=True)

    # Samples whose ideal DCG is 0 get a score of 0 instead of dividing by 0.
    zero_ideal = ideal_dcg == 0
    nonzero_ideal = ~zero_ideal
    dcg[zero_ideal] = 0
    dcg[nonzero_ideal] /= ideal_dcg[nonzero_ideal]
    return dcg
|
|
|
|
@validate_params(
    {
        "y_true": ["array-like"],
        "y_score": ["array-like"],
        "k": [Interval(Integral, 1, None, closed="left"), None],
        "sample_weight": ["array-like", None],
        "ignore_ties": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def ndcg_score(y_true, y_score, *, k=None, sample_weight=None, ignore_ties=False):
    """Compute Normalized Discounted Cumulative Gain.

    Sum the true scores ranked in the order induced by the predicted scores,
    after applying a logarithmic discount. Then divide by the best possible
    score (Ideal DCG, obtained for a perfect ranking) to obtain a score between
    0 and 1.

    This ranking metric returns a high value if true labels are ranked high by
    ``y_score``.

    Parameters
    ----------
    y_true : array-like of shape (n_samples, n_labels)
        True targets of multilabel classification, or true scores of entities
        to be ranked. Negative values in `y_true` are not supported and raise
        a `ValueError`.

    y_score : array-like of shape (n_samples, n_labels)
        Target scores, can either be probability estimates, confidence values,
        or non-thresholded measure of decisions (as returned by
        "decision_function" on some classifiers).

    k : int, default=None
        Only consider the highest k scores in the ranking. If `None`, use all
        outputs.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. If `None`, all samples are given the same weight.

    ignore_ties : bool, default=False
        Assume that there are no ties in y_score (which is likely to be the
        case if y_score is continuous) for efficiency gains.

    Returns
    -------
    normalized_discounted_cumulative_gain : float in [0., 1.]
        The averaged NDCG scores for all samples.

    Raises
    ------
    ValueError
        If `y_true` contains negative values, if `y_true` is
        multi-dimensional with a single column (only one document per
        sample), or if the target type of `y_true` is not supported.

    See Also
    --------
    dcg_score : Discounted Cumulative Gain (not normalized).

    References
    ----------
    `Wikipedia entry for Discounted Cumulative Gain
    <https://en.wikipedia.org/wiki/Discounted_cumulative_gain>`_

    Jarvelin, K., & Kekalainen, J. (2002).
    Cumulated gain-based evaluation of IR techniques. ACM Transactions on
    Information Systems (TOIS), 20(4), 422-446.

    Wang, Y., Wang, L., Li, Y., He, D., Chen, W., & Liu, T. Y. (2013, May).
    A theoretical analysis of NDCG ranking measures. In Proceedings of the 26th
    Annual Conference on Learning Theory (COLT 2013)

    McSherry, F., & Najork, M. (2008, March). Computing information retrieval
    performance measures efficiently in the presence of tied scores. In
    European conference on information retrieval (pp. 414-421). Springer,
    Berlin, Heidelberg.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics import ndcg_score
    >>> # we have ground-truth relevance of some answers to a query:
    >>> true_relevance = np.asarray([[10, 0, 0, 1, 5]])
    >>> # we predict some scores (relevance) for the answers
    >>> scores = np.asarray([[.1, .2, .3, 4, 70]])
    >>> ndcg_score(true_relevance, scores)
    np.float64(0.69...)
    >>> scores = np.asarray([[.05, 1.1, 1., .5, .0]])
    >>> ndcg_score(true_relevance, scores)
    np.float64(0.49...)
    >>> # we can set k to truncate the sum; only top k answers contribute.
    >>> ndcg_score(true_relevance, scores, k=4)
    np.float64(0.35...)
    >>> # the normalization takes k into account so a perfect answer
    >>> # would still get 1.0
    >>> ndcg_score(true_relevance, true_relevance, k=4)
    np.float64(1.0...)
    >>> # now we have some ties in our prediction
    >>> scores = np.asarray([[1, 0, 0, 0, 1]])
    >>> # by default ties are averaged, so here we get the average (normalized)
    >>> # true relevance of our top predictions: (10 / 10 + 5 / 10) / 2 = .75
    >>> ndcg_score(true_relevance, scores, k=1)
    np.float64(0.75...)
    >>> # we can choose to ignore ties for faster results, but only
    >>> # if we know there aren't ties in our scores, otherwise we get
    >>> # wrong results:
    >>> ndcg_score(true_relevance,
    ...           scores, k=1, ignore_ties=True)
    np.float64(0.5...)
    """
    y_true = check_array(y_true, ensure_2d=False)
    y_score = check_array(y_score, ensure_2d=False)
    check_consistent_length(y_true, y_score, sample_weight)

    # Negative relevance values are rejected outright rather than producing a
    # score outside [0, 1].
    if y_true.min() < 0:
        raise ValueError("ndcg_score should not be used on negative y_true values.")
    # A ranking over a single document carries no information.
    if y_true.ndim > 1 and y_true.shape[1] <= 1:
        raise ValueError(
            "Computing NDCG is only meaningful when there is more than 1 document. "
            f"Got {y_true.shape[1]} instead."
        )
    _check_dcg_target_type(y_true)
    gain = _ndcg_sample_scores(y_true, y_score, k=k, ignore_ties=ignore_ties)
    return np.average(gain, weights=sample_weight)
|
|
|
|
@validate_params(
    {
        "y_true": ["array-like"],
        "y_score": ["array-like"],
        "k": [Interval(Integral, 1, None, closed="left")],
        "normalize": ["boolean"],
        "sample_weight": ["array-like", None],
        "labels": ["array-like", None],
    },
    prefer_skip_nested_validation=True,
)
def top_k_accuracy_score(
    y_true, y_score, *, k=2, normalize=True, sample_weight=None, labels=None
):
    """Top-k Accuracy classification score.

    This metric computes the number of times where the correct label is among
    the top `k` labels predicted (ranked by predicted scores). Note that the
    multilabel case isn't covered here.

    Read more in the :ref:`User Guide <top_k_accuracy_score>`.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True labels.

    y_score : array-like of shape (n_samples,) or (n_samples, n_classes)
        Target scores. These can be either probability estimates or
        non-thresholded decision values (as returned by
        :term:`decision_function` on some classifiers).
        The binary case expects scores with shape (n_samples,) while the
        multiclass case expects scores with shape (n_samples, n_classes).
        In the multiclass case, the order of the class scores must
        correspond to the order of ``labels``, if provided, or else to
        the numerical or lexicographical order of the labels in ``y_true``.
        If ``y_true`` does not contain all the labels, ``labels`` must be
        provided.

    k : int, default=2
        Number of most likely outcomes considered to find the correct label.

    normalize : bool, default=True
        If `True`, return the fraction of correctly classified samples.
        Otherwise, return the number of correctly classified samples.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. If `None`, all samples are given the same weight.

    labels : array-like of shape (n_classes,), default=None
        Multiclass only. List of labels that index the classes in ``y_score``.
        If ``None``, the numerical or lexicographical order of the labels in
        ``y_true`` is used. If ``y_true`` does not contain all the labels,
        ``labels`` must be provided.

    Returns
    -------
    score : float
        The top-k accuracy score. The best performance is 1 with
        `normalize == True` and the number of samples with
        `normalize == False`.

    Raises
    ------
    ValueError
        If the type of `y_true` is neither binary nor multiclass, if `y_true`
        is binary while `y_score` is 2d with a number of columns other than 1,
        if the number of classes (taken from `labels` when provided, otherwise
        from `y_true`) differs from the number of classes in `y_score`, if
        `labels` is not unique or not ordered, or if `y_true` contains labels
        that are not in `labels`.

    Warns
    -----
    UndefinedMetricWarning
        If `k` is greater than or equal to the number of classes.

    See Also
    --------
    accuracy_score : Compute the accuracy score. By default, the function will
        return the fraction of correct predictions divided by the total number
        of predictions.

    Notes
    -----
    In cases where two or more labels are assigned equal predicted scores,
    the labels with the highest indices will be chosen first. This might
    impact the result if the correct label falls after the threshold because
    of that.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics import top_k_accuracy_score
    >>> y_true = np.array([0, 1, 2, 2])
    >>> y_score = np.array([[0.5, 0.2, 0.2],  # 0 is in top 2
    ...                     [0.3, 0.4, 0.2],  # 1 is in top 2
    ...                     [0.2, 0.4, 0.3],  # 2 is in top 2
    ...                     [0.7, 0.2, 0.1]]) # 2 isn't in top 2
    >>> top_k_accuracy_score(y_true, y_score, k=2)
    np.float64(0.75)
    >>> # Not normalizing gives the number of "correctly" classified samples
    >>> top_k_accuracy_score(y_true, y_score, k=2, normalize=False)
    np.int64(3)
    """
    y_true = check_array(y_true, ensure_2d=False, dtype=None)
    y_true = column_or_1d(y_true)
    y_type = type_of_target(y_true, input_name="y_true")
    # A target that looks binary is handled as multiclass when the caller
    # declares more than two labels.
    if y_type == "binary" and labels is not None and len(labels) > 2:
        y_type = "multiclass"
    if y_type not in {"binary", "multiclass"}:
        raise ValueError(
            f"y type must be 'binary' or 'multiclass', got '{y_type}' instead."
        )
    y_score = check_array(y_score, ensure_2d=False)
    if y_type == "binary":
        if y_score.ndim == 2 and y_score.shape[1] != 1:
            raise ValueError(
                "`y_true` is binary while y_score is 2d with"
                f" {y_score.shape[1]} classes. If `y_true` does not contain all the"
                " labels, `labels` must be provided."
            )
        y_score = column_or_1d(y_score)

    check_consistent_length(y_true, y_score, sample_weight)
    # 1d scores are counted as describing two classes.
    y_score_n_classes = y_score.shape[1] if y_score.ndim == 2 else 2

    if labels is None:
        classes = _unique(y_true)
        n_classes = len(classes)

        if n_classes != y_score_n_classes:
            # The trailing space after the first sentence keeps the two
            # sentences of the message from being glued together.
            raise ValueError(
                f"Number of classes in 'y_true' ({n_classes}) not equal "
                f"to the number of classes in 'y_score' ({y_score_n_classes}). "
                "You can provide a list of all known classes by assigning it "
                "to the `labels` parameter."
            )
    else:
        labels = column_or_1d(labels)
        classes = _unique(labels)
        n_labels = len(labels)
        n_classes = len(classes)

        # Fewer unique values than entries means `labels` has duplicates.
        if n_classes != n_labels:
            raise ValueError("Parameter 'labels' must be unique.")

        # `labels` must already be in the order produced by `_unique`.
        if not np.array_equal(classes, labels):
            raise ValueError("Parameter 'labels' must be ordered.")

        if n_classes != y_score_n_classes:
            raise ValueError(
                f"Number of given labels ({n_classes}) not equal to the "
                f"number of classes in 'y_score' ({y_score_n_classes})."
            )

        if len(np.setdiff1d(y_true, classes)):
            raise ValueError("'y_true' contains labels not in parameter 'labels'.")

    if k >= n_classes:
        warnings.warn(
            (
                f"'k' ({k}) greater than or equal to 'n_classes' ({n_classes}) "
                "will result in a perfect score and is therefore meaningless."
            ),
            UndefinedMetricWarning,
        )

    y_true_encoded = _encode(y_true, uniques=classes)

    if y_type == "binary":
        if k == 1:
            # Scores lying entirely within [0, 1] are thresholded at 0.5,
            # any other scores are thresholded at 0.
            threshold = 0.5 if y_score.min() >= 0 and y_score.max() <= 1 else 0
            y_pred = (y_score > threshold).astype(np.int64)
            hits = y_pred == y_true_encoded
        else:
            # With k >= 2 in the binary case every sample counts as a hit.
            hits = np.ones_like(y_score, dtype=np.bool_)
    elif y_type == "multiclass":
        # Stable ascending argsort, then reversed: class indices ordered by
        # decreasing score, ties resolved in favor of the highest index.
        sorted_pred = np.argsort(y_score, axis=1, kind="mergesort")[:, ::-1]
        # Compare each of the k best-ranked classes (one row per rank after
        # the transpose) with the encoded true label of every sample.
        hits = (y_true_encoded == sorted_pred[:, :k].T).any(axis=0)

    if normalize:
        return np.average(hits, weights=sample_weight)
    elif sample_weight is None:
        return np.sum(hits)
    else:
        return np.dot(hits, sample_weight)
|
|