| """ |
| This module contains loss classes suitable for fitting. |
| |
| It is not part of the public API. |
| Specific losses are used for regression, binary classification or multiclass |
| classification. |
| """ |
|
|
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import numbers |
|
|
| import numpy as np |
| from scipy.special import xlogy |
|
|
| from ..utils import check_scalar |
| from ..utils.stats import _weighted_percentile |
| from ._loss import ( |
| CyAbsoluteError, |
| CyExponentialLoss, |
| CyHalfBinomialLoss, |
| CyHalfGammaLoss, |
| CyHalfMultinomialLoss, |
| CyHalfPoissonLoss, |
| CyHalfSquaredError, |
| CyHalfTweedieLoss, |
| CyHalfTweedieLossIdentity, |
| CyHuberLoss, |
| CyPinballLoss, |
| ) |
| from .link import ( |
| HalfLogitLink, |
| IdentityLink, |
| Interval, |
| LogitLink, |
| LogLink, |
| MultinomialLogit, |
| ) |
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
class BaseLoss:
    """Common machinery for loss functions of 1-dimensional targets.

    Shape conventions:

    - y_true.shape = sample_weight.shape = (n_samples,)
    - y_pred.shape = raw_prediction.shape = (n_samples,)
    - If is_multiclass is true (multiclass classification), then
      y_pred.shape = raw_prediction.shape = (n_samples, n_classes).
      Note that this corresponds to the return value of decision_function.

    Dtype conventions: y_true, y_pred, sample_weight and raw_prediction must
    either be all float64 or all float32; gradient and hessian must be either
    both float64 or both float32.

    Note that y_pred = link.inverse(raw_prediction).

    Specific loss classes can inherit specific link classes to satisfy
    BaseLink's abstractmethods.

    Parameters
    ----------
    closs : CyLossFunction
        Low-level object that performs the pointwise computations; it is
        called by `loss`, `gradient`, `loss_gradient` and `gradient_hessian`.
    link : BaseLink
        Link function object; its `interval_y_pred` becomes the valid range
        of y_pred.
    n_classes : {None, int}
        The number of classes for classification, else None.

    Attributes
    ----------
    closs: CyLossFunction
    link : BaseLink
    interval_y_true : Interval
        Valid interval for y_true
    interval_y_pred : Interval
        Valid Interval for y_pred
    differentiable : bool
        Indicates whether or not loss function is differentiable in
        raw_prediction everywhere.
    need_update_leaves_values : bool
        Indicates whether decision trees in gradient boosting need to update
        leaf values after having been fit to the (negative) gradients.
    approx_hessian : bool
        Indicates whether the hessian is approximated or exact. If
        approximated, it should be larger or equal to the exact one.
    constant_hessian : bool
        Indicates whether the hessian is one for this loss.
    is_multiclass : bool
        Indicates whether n_classes > 2 is allowed.
    """

    # Class-level defaults; subclasses override them where their loss differs
    # (e.g. non-differentiable losses, multiclass losses).
    differentiable = True
    need_update_leaves_values = False
    is_multiclass = False

    def __init__(self, closs, link, n_classes=None):
        self.closs = closs
        self.link = link
        self.n_classes = n_classes
        # Conservative defaults: exact, non-constant hessian and an
        # unrestricted target range. Subclasses adjust them after calling
        # this constructor.
        self.approx_hessian = False
        self.constant_hessian = False
        self.interval_y_true = Interval(-np.inf, np.inf, False, False)
        self.interval_y_pred = self.link.interval_y_pred

    @staticmethod
    def _flatten_single_column(array):
        """Return `array` with shape (n, 1) viewed as (n,), else unchanged."""
        if array.ndim == 2 and array.shape[1] == 1:
            return array.squeeze(1)
        return array

    def in_y_true_range(self, y):
        """Return True if y is in the valid range of y_true.

        Parameters
        ----------
        y : ndarray
        """
        valid_targets = self.interval_y_true
        return valid_targets.includes(y)

    def in_y_pred_range(self, y):
        """Return True if y is in the valid range of y_pred.

        Parameters
        ----------
        y : ndarray
        """
        valid_predictions = self.interval_y_pred
        return valid_predictions.includes(y)

    def loss(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        loss_out=None,
        n_threads=1,
    ):
        """Compute the pointwise loss value for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of \
            shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        loss_out : None or C-contiguous array of shape (n_samples,)
            A location into which the result is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        loss : array of shape (n_samples,)
            Element-wise loss function.
        """
        if loss_out is None:
            loss_out = np.empty_like(y_true)

        # Be graceful to shape (n_samples, 1) -> (n_samples,).
        raw_prediction = self._flatten_single_column(raw_prediction)

        self.closs.loss(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            loss_out=loss_out,
            n_threads=n_threads,
        )
        return loss_out

    def loss_gradient(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        loss_out=None,
        gradient_out=None,
        n_threads=1,
    ):
        """Compute loss and gradient w.r.t. raw_prediction for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of \
            shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        loss_out : None or C-contiguous array of shape (n_samples,)
            A location into which the loss is stored. If None, a new array
            might be created.
        gradient_out : None or C-contiguous array of shape (n_samples,) or array \
            of shape (n_samples, n_classes)
            A location into which the gradient is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        loss : array of shape (n_samples,)
            Element-wise loss function.

        gradient : array of shape (n_samples,) or (n_samples, n_classes)
            Element-wise gradients.
        """
        # Allocate whichever buffers are missing. When only one is supplied,
        # the other one borrows its dtype.
        if loss_out is None and gradient_out is None:
            loss_out = np.empty_like(y_true)
            gradient_out = np.empty_like(raw_prediction)
        elif loss_out is None:
            loss_out = np.empty_like(y_true, dtype=gradient_out.dtype)
        elif gradient_out is None:
            gradient_out = np.empty_like(raw_prediction, dtype=loss_out.dtype)

        # Be graceful to shape (n_samples, 1) -> (n_samples,).
        raw_prediction = self._flatten_single_column(raw_prediction)
        gradient_out = self._flatten_single_column(gradient_out)

        self.closs.loss_gradient(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            loss_out=loss_out,
            gradient_out=gradient_out,
            n_threads=n_threads,
        )
        return loss_out, gradient_out

    def gradient(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        gradient_out=None,
        n_threads=1,
    ):
        """Compute gradient of loss w.r.t raw_prediction for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of \
            shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or C-contiguous array of shape (n_samples,) or array \
            of shape (n_samples, n_classes)
            A location into which the result is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        gradient : array of shape (n_samples,) or (n_samples, n_classes)
            Element-wise gradients.
        """
        if gradient_out is None:
            gradient_out = np.empty_like(raw_prediction)

        # Be graceful to shape (n_samples, 1) -> (n_samples,).
        raw_prediction = self._flatten_single_column(raw_prediction)
        gradient_out = self._flatten_single_column(gradient_out)

        self.closs.gradient(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            gradient_out=gradient_out,
            n_threads=n_threads,
        )
        return gradient_out

    def gradient_hessian(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        gradient_out=None,
        hessian_out=None,
        n_threads=1,
    ):
        """Compute gradient and hessian of loss w.r.t raw_prediction.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of \
            shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or C-contiguous array of shape (n_samples,) or array \
            of shape (n_samples, n_classes)
            A location into which the gradient is stored. If None, a new array
            might be created.
        hessian_out : None or C-contiguous array of shape (n_samples,) or array \
            of shape (n_samples, n_classes)
            A location into which the hessian is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        gradient : arrays of shape (n_samples,) or (n_samples, n_classes)
            Element-wise gradients.

        hessian : arrays of shape (n_samples,) or (n_samples, n_classes)
            Element-wise hessians.
        """
        # Allocate whichever buffers are missing. When only one is supplied,
        # the other one mirrors it.
        if gradient_out is None and hessian_out is None:
            gradient_out = np.empty_like(raw_prediction)
            hessian_out = np.empty_like(raw_prediction)
        elif gradient_out is None:
            gradient_out = np.empty_like(hessian_out)
        elif hessian_out is None:
            hessian_out = np.empty_like(gradient_out)

        # Be graceful to shape (n_samples, 1) -> (n_samples,).
        raw_prediction = self._flatten_single_column(raw_prediction)
        gradient_out = self._flatten_single_column(gradient_out)
        hessian_out = self._flatten_single_column(hessian_out)

        self.closs.gradient_hessian(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            gradient_out=gradient_out,
            hessian_out=hessian_out,
            n_threads=n_threads,
        )
        return gradient_out, hessian_out

    def __call__(self, y_true, raw_prediction, sample_weight=None, n_threads=1):
        """Compute the weighted average loss.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of \
            shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        loss : float
            Mean or averaged loss function.
        """
        # The pointwise losses are computed unweighted on purpose: the weights
        # enter exactly once, through the averaging below.
        pointwise = self.loss(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=None,
            loss_out=None,
            n_threads=n_threads,
        )
        return np.average(pointwise, weights=sample_weight)

    def fit_intercept_only(self, y_true, sample_weight=None):
        """Compute raw_prediction of an intercept-only model.

        This can be used as initial estimates of predictions, i.e. before the
        first iteration in fit.

        Parameters
        ----------
        y_true : array-like of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or array of shape (n_samples,)
            Sample weights.

        Returns
        -------
        raw_prediction : numpy scalar or array of shape (n_classes,)
            Raw predictions of an intercept-only model.
        """
        bounds = self.interval_y_pred

        # Start from the weighted mean of the targets, then move it inside
        # the valid y_pred range before mapping it to link space.
        y_pred = np.average(y_true, weights=sample_weight, axis=0)
        eps = 10 * np.finfo(y_pred.dtype).eps

        # Open bounds are pulled inwards by eps so that the clipped value lies
        # strictly inside the interval; infinite bounds need no clipping.
        if bounds.low == -np.inf:
            lower = None
        elif bounds.low_inclusive:
            lower = bounds.low
        else:
            lower = bounds.low + eps

        if bounds.high == np.inf:
            upper = None
        elif bounds.high_inclusive:
            upper = bounds.high
        else:
            upper = bounds.high - eps

        if lower is None and upper is None:
            return self.link.link(y_pred)
        return self.link.link(np.clip(y_pred, lower, upper))

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        """Calculate term dropped in loss.

        With this term added, the loss of perfect predictions is zero.
        The base implementation returns zeros shaped like `y_true`.
        """
        return np.zeros_like(y_true)

    def init_gradient_and_hessian(self, n_samples, dtype=np.float64, order="F"):
        """Initialize arrays for gradients and hessians.

        Unless hessians are constant, arrays are initialized with undefined values.

        Parameters
        ----------
        n_samples : int
            The number of samples, usually passed to `fit()`.
        dtype : {np.float64, np.float32}, default=np.float64
            The dtype of the arrays gradient and hessian.
        order : {'C', 'F'}, default='F'
            Order of the arrays gradient and hessian. The default 'F' makes the arrays
            contiguous along samples.

        Returns
        -------
        gradient : C-contiguous array of shape (n_samples,) or array of shape \
            (n_samples, n_classes)
            Empty array (allocated but not initialized) to be used as argument
            gradient_out.
        hessian : C-contiguous array of shape (n_samples,), array of shape
            (n_samples, n_classes) or shape (1,)
            Empty (allocated but not initialized) array to be used as argument
            hessian_out.
            If constant_hessian is True (e.g. `HalfSquaredError`), the array is
            initialized to ``1``.

        Raises
        ------
        ValueError
            If `dtype` is neither np.float32 nor np.float64.
        """
        if dtype not in (np.float32, np.float64):
            raise ValueError(
                "Valid options for 'dtype' are np.float32 and np.float64. "
                f"Got dtype={dtype} instead."
            )

        shape = (n_samples, self.n_classes) if self.is_multiclass else (n_samples,)
        gradient = np.empty(shape=shape, dtype=dtype, order=order)

        if self.constant_hessian:
            # A constant hessian is represented by a single element equal to
            # one instead of a full (n_samples,) array.
            hessian = np.ones(shape=(1,), dtype=dtype)
        else:
            hessian = np.empty(shape=shape, dtype=dtype, order=order)

        return gradient, hessian
|
|
|
|
| |
| |
| |
| |
class HalfSquaredError(BaseLoss):
    """Half squared error with identity link, for regression.

    Domain:
    y_true and y_pred all real numbers

    Link:
    y_pred = raw_prediction

    For a given sample x_i, half squared error is defined as::

        loss(x_i) = 0.5 * (y_true_i - raw_prediction_i)**2

    The factor of 0.5 simplifies the computation of gradients and results in a
    unit hessian (and is consistent with what is done in LightGBM). It is also
    half the Normal distribution deviance.

    Parameters
    ----------
    sample_weight : {None, ndarray}
        Only checked for being None: without sample weights the hessian is
        flagged as constant.
    """

    def __init__(self, sample_weight=None):
        cy_loss = CyHalfSquaredError()
        identity = IdentityLink()
        super().__init__(closs=cy_loss, link=identity)
        # The unit hessian stays constant only when no sample weights rescale it.
        unweighted = sample_weight is None
        self.constant_hessian = unweighted
|
|
|
|
class AbsoluteError(BaseLoss):
    """Absolute error with identity link, for regression.

    Domain:
    y_true and y_pred all real numbers

    Link:
    y_pred = raw_prediction

    For a given sample x_i, the absolute error is defined as::

        loss(x_i) = |y_true_i - raw_prediction_i|

    Note that the exact hessian = 0 almost everywhere (except at one point, therefore
    differentiable = False). Optimization routines like in HGBT, however, need a
    hessian > 0. Therefore, we assign 1.

    Parameters
    ----------
    sample_weight : {None, ndarray}
        Only checked for being None: without sample weights the hessian is
        flagged as constant.
    """

    differentiable = False
    need_update_leaves_values = True

    def __init__(self, sample_weight=None):
        cy_loss = CyAbsoluteError()
        identity = IdentityLink()
        super().__init__(closs=cy_loss, link=identity)
        # The hessian is a stand-in value (see class docstring), hence approximate.
        self.approx_hessian = True
        self.constant_hessian = sample_weight is None

    def fit_intercept_only(self, y_true, sample_weight=None):
        """Compute raw_prediction of an intercept-only model.

        This is the weighted median of the target, i.e. over the samples
        axis=0.
        """
        if sample_weight is not None:
            return _weighted_percentile(y_true, sample_weight, 50)
        return np.median(y_true, axis=0)
|
|
|
|
class PinballLoss(BaseLoss):
    """Quantile loss aka pinball loss, for regression.

    Domain:
    y_true and y_pred all real numbers
    quantile in (0, 1)

    Link:
    y_pred = raw_prediction

    For a given sample x_i, the pinball loss is defined as::

        loss(x_i) = rho_{quantile}(y_true_i - raw_prediction_i)

        rho_{quantile}(u) = u * (quantile - 1_{u<0})
                          = -u *(1 - quantile)  if u < 0
                             u * quantile       if u >= 0

    Note: 2 * PinballLoss(quantile=0.5) equals AbsoluteError().

    Note that the exact hessian = 0 almost everywhere (except at one point, therefore
    differentiable = False). Optimization routines like in HGBT, however, need a
    hessian > 0. Therefore, we assign 1.

    Additional Attributes
    ---------------------
    quantile : float
        The quantile level of the quantile to be estimated. Must be in range (0, 1).
    """

    differentiable = False
    need_update_leaves_values = True

    def __init__(self, sample_weight=None, quantile=0.5):
        # Reject anything that is not a real number strictly between 0 and 1.
        check_scalar(
            quantile,
            "quantile",
            target_type=numbers.Real,
            min_val=0,
            max_val=1,
            include_boundaries="neither",
        )
        cy_loss = CyPinballLoss(quantile=float(quantile))
        super().__init__(closs=cy_loss, link=IdentityLink())
        # The hessian is a stand-in value (see class docstring), hence approximate.
        self.approx_hessian = True
        self.constant_hessian = sample_weight is None

    def fit_intercept_only(self, y_true, sample_weight=None):
        """Compute raw_prediction of an intercept-only model.

        This is the (weighted) quantile of the target at the configured
        quantile level, i.e. over the samples axis=0.
        """
        percent = 100 * self.closs.quantile
        if sample_weight is not None:
            return _weighted_percentile(y_true, sample_weight, percent)
        return np.percentile(y_true, percent, axis=0)
|
|
|
|
class HuberLoss(BaseLoss):
    """Huber loss, for regression.

    Domain:
    y_true and y_pred all real numbers
    quantile in (0, 1)

    Link:
    y_pred = raw_prediction

    For a given sample x_i, the Huber loss is defined as::

        loss(x_i) = 1/2 * abserr**2            if abserr <= delta
                    delta * (abserr - delta/2) if abserr > delta

        abserr = |y_true_i - raw_prediction_i|
        delta = quantile(abserr, self.quantile)

    Note: HuberLoss(quantile=1) equals HalfSquaredError and HuberLoss(quantile=0)
    equals delta * (AbsoluteError() - delta/2).

    Additional Attributes
    ---------------------
    quantile : float
        The quantile level which defines the breaking point `delta` to distinguish
        between absolute error and squared error. Must be in range (0, 1).

    Reference
    ---------
    .. [1] Friedman, J.H. (2001). :doi:`Greedy function approximation: A gradient
      boosting machine <10.1214/aos/1013203451>`.
      Annals of Statistics, 29, 1189-1232.
    """

    differentiable = False
    need_update_leaves_values = True

    def __init__(self, sample_weight=None, quantile=0.9, delta=0.5):
        # Reject anything that is not a real number strictly between 0 and 1.
        check_scalar(
            quantile,
            "quantile",
            target_type=numbers.Real,
            min_val=0,
            max_val=1,
            include_boundaries="neither",
        )
        # The quantile lives on the Python object; only delta is handed to
        # the low-level loss.
        self.quantile = quantile
        cy_loss = CyHuberLoss(delta=float(delta))
        super().__init__(closs=cy_loss, link=IdentityLink())
        self.approx_hessian = True
        self.constant_hessian = False

    def fit_intercept_only(self, y_true, sample_weight=None):
        """Compute raw_prediction of an intercept-only model.

        This is the (weighted) median of the target plus the (weighted) mean
        of the deviations from that median, each deviation being limited in
        magnitude to `delta`.
        """
        if sample_weight is not None:
            center = _weighted_percentile(y_true, sample_weight, 50)
        else:
            center = np.percentile(y_true, 50, axis=0)
        deviation = y_true - center
        # Keep the sign of every deviation but cap its magnitude at delta.
        capped = np.sign(deviation) * np.minimum(self.closs.delta, np.abs(deviation))
        return center + np.average(capped, weights=sample_weight)
|
|
|
|
class HalfPoissonLoss(BaseLoss):
    """Half Poisson deviance loss with log-link, for regression.

    Domain:
    y_true in non-negative real numbers
    y_pred in positive real numbers

    Link:
    y_pred = exp(raw_prediction)

    For a given sample x_i, half the Poisson deviance is defined as::

        loss(x_i) = y_true_i * log(y_true_i/exp(raw_prediction_i))
                    - y_true_i + exp(raw_prediction_i)

    Half the Poisson deviance is actually the negative log-likelihood up to
    constant terms (not involving raw_prediction) and simplifies the
    computation of the gradients.
    We also skip the constant term `y_true_i * log(y_true_i) - y_true_i`.
    """

    def __init__(self, sample_weight=None):
        cy_loss = CyHalfPoissonLoss()
        super().__init__(closs=cy_loss, link=LogLink())
        # Targets may be zero but not negative: [0, inf).
        self.interval_y_true = Interval(0, np.inf, True, False)

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        """Return the skipped term `y_true * log(y_true) - y_true`, weighted
        by `sample_weight` if given."""
        constant = xlogy(y_true, y_true) - y_true
        if sample_weight is None:
            return constant
        constant *= sample_weight
        return constant
|
|
|
|
class HalfGammaLoss(BaseLoss):
    """Half Gamma deviance loss with log-link, for regression.

    Domain:
    y_true and y_pred in positive real numbers

    Link:
    y_pred = exp(raw_prediction)

    For a given sample x_i, half Gamma deviance loss is defined as::

        loss(x_i) = log(exp(raw_prediction_i)/y_true_i)
                    + y_true_i/exp(raw_prediction_i) - 1

    Half the Gamma deviance is actually proportional to the negative log-
    likelihood up to constant terms (not involving raw_prediction) and
    simplifies the computation of the gradients.
    We also skip the constant term `-log(y_true_i) - 1`.
    """

    def __init__(self, sample_weight=None):
        cy_loss = CyHalfGammaLoss()
        super().__init__(closs=cy_loss, link=LogLink())
        # Targets must be strictly positive: (0, inf).
        self.interval_y_true = Interval(0, np.inf, False, False)

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        """Return the skipped term `-log(y_true) - 1`, weighted by
        `sample_weight` if given."""
        constant = -np.log(y_true) - 1
        if sample_weight is None:
            return constant
        constant *= sample_weight
        return constant
|
|
|
|
class HalfTweedieLoss(BaseLoss):
    """Half Tweedie deviance loss with log-link, for regression.

    Domain:
    y_true in real numbers for power <= 0
    y_true in non-negative real numbers for 0 < power < 2
    y_true in positive real numbers for 2 <= power
    y_pred in positive real numbers
    power in real numbers

    Link:
    y_pred = exp(raw_prediction)

    For a given sample x_i, half Tweedie deviance loss with p=power is defined
    as::

        loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p)
                    - y_true_i * exp(raw_prediction_i)**(1-p) / (1-p)
                    + exp(raw_prediction_i)**(2-p) / (2-p)

    Taking the limits for p=0, 1, 2 gives HalfSquaredError with a log link,
    HalfPoissonLoss and HalfGammaLoss.

    We also skip constant terms, but those are different for p=0, 1, 2.
    Therefore, the loss is not continuous in `power`.

    Note furthermore that although no Tweedie distribution exists for
    0 < power < 1, it still gives a strictly consistent scoring function for
    the expectation.
    """

    def __init__(self, sample_weight=None, power=1.5):
        cy_loss = CyHalfTweedieLoss(power=float(power))
        super().__init__(closs=cy_loss, link=LogLink())
        # The admissible target range depends on the power (see "Domain").
        p = self.closs.power
        if p <= 0:
            self.interval_y_true = Interval(-np.inf, np.inf, False, False)
        elif p < 2:
            self.interval_y_true = Interval(0, np.inf, True, False)
        else:
            self.interval_y_true = Interval(0, np.inf, False, False)

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        """Return the term skipped in the loss, weighted by `sample_weight`
        if given.

        For power 0, 1 and 2 the computation is delegated to the loss class
        of the corresponding limiting case.
        """
        p = self.closs.power
        limiting_cases = {
            0: HalfSquaredError,
            1: HalfPoissonLoss,
            2: HalfGammaLoss,
        }
        limiting_loss = limiting_cases.get(p)
        if limiting_loss is not None:
            return limiting_loss().constant_to_optimal_zero(
                y_true=y_true, sample_weight=sample_weight
            )

        constant = np.power(np.maximum(y_true, 0), 2 - p) / (1 - p) / (2 - p)
        if sample_weight is None:
            return constant
        constant *= sample_weight
        return constant
|
|
|
|
class HalfTweedieLossIdentity(BaseLoss):
    """Half Tweedie deviance loss with identity link, for regression.

    Domain:
    y_true in real numbers for power <= 0
    y_true in non-negative real numbers for 0 < power < 2
    y_true in positive real numbers for 2 <= power
    y_pred in positive real numbers for power != 0
    y_pred in real numbers for power = 0
    power in real numbers

    Link:
    y_pred = raw_prediction

    For a given sample x_i, half Tweedie deviance loss with p=power is defined
    as::

        loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p)
                    - y_true_i * raw_prediction_i**(1-p) / (1-p)
                    + raw_prediction_i**(2-p) / (2-p)

    Note that the minimum value of this loss is 0.

    Note furthermore that although no Tweedie distribution exists for
    0 < power < 1, it still gives a strictly consistent scoring function for
    the expectation.
    """

    def __init__(self, sample_weight=None, power=1.5):
        cy_loss = CyHalfTweedieLossIdentity(power=float(power))
        super().__init__(closs=cy_loss, link=IdentityLink())
        p = self.closs.power

        # The admissible target range depends on the power (see "Domain").
        if p <= 0:
            self.interval_y_true = Interval(-np.inf, np.inf, False, False)
        elif p < 2:
            self.interval_y_true = Interval(0, np.inf, True, False)
        else:
            self.interval_y_true = Interval(0, np.inf, False, False)

        # Predictions are unrestricted only for power 0; otherwise they must
        # be strictly positive.
        if p == 0:
            self.interval_y_pred = Interval(-np.inf, np.inf, False, False)
        else:
            self.interval_y_pred = Interval(0, np.inf, False, False)
|
|
|
|
class HalfBinomialLoss(BaseLoss):
    """Half Binomial deviance loss with logit link, for binary classification.

    This is also known as binary cross entropy, log-loss and logistic loss.

    Domain:
    y_true in [0, 1], i.e. regression on the unit interval
    y_pred in (0, 1), i.e. boundaries excluded

    Link:
    y_pred = expit(raw_prediction)

    For a given sample x_i, half Binomial deviance is defined as the negative
    log-likelihood of the Binomial/Bernoulli distribution and can be expressed
    as::

        loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i

    See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman,
    section 4.4.1 (about logistic regression).

    Note that the formulation works for classification, y = {0, 1}, as well as
    logistic regression, y = [0, 1].
    If you add `constant_to_optimal_zero` to the loss, you get half the
    Bernoulli/binomial deviance.

    More details: Inserting the predicted probability y_pred = expit(raw_prediction)
    in the loss gives the well known::

        loss(x_i) = - y_true_i * log(y_pred_i) - (1 - y_true_i) * log(1 - y_pred_i)
    """

    def __init__(self, sample_weight=None):
        cy_loss = CyHalfBinomialLoss()
        super().__init__(closs=cy_loss, link=LogitLink(), n_classes=2)
        # Targets live on the closed unit interval [0, 1].
        self.interval_y_true = Interval(0, 1, True, True)

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        """Return `y*log(y) + (1-y)*log(1-y)` for y = y_true, weighted by
        `sample_weight` if given."""
        # This is non-zero only if y_true is neither 0 nor 1.
        constant = xlogy(y_true, y_true) + xlogy(1 - y_true, 1 - y_true)
        if sample_weight is None:
            return constant
        constant *= sample_weight
        return constant

    def predict_proba(self, raw_prediction):
        """Predict probabilities.

        Parameters
        ----------
        raw_prediction : array of shape (n_samples,) or (n_samples, 1)
            Raw prediction values (in link space).

        Returns
        -------
        proba : array of shape (n_samples, 2)
            Element-wise class probabilities.
        """
        raw = raw_prediction
        # Be graceful to shape (n_samples, 1) -> (n_samples,).
        if raw.ndim == 2 and raw.shape[1] == 1:
            raw = raw.squeeze(1)
        n_samples = raw.shape[0]
        proba = np.empty((n_samples, 2), dtype=raw.dtype)
        # Column 1 holds the inverse-link output; column 0 is its complement.
        proba[:, 1] = self.link.inverse(raw)
        proba[:, 0] = 1 - proba[:, 1]
        return proba
|
|
|
|
class HalfMultinomialLoss(BaseLoss):
    """Categorical cross-entropy loss, for multiclass classification.

    Domain:
    y_true in {0, 1, 2, 3, .., n_classes - 1}
    y_pred has n_classes elements, each element in (0, 1)

    Link:
    y_pred = softmax(raw_prediction)

    Note: We assume y_true to be already label encoded. The inverse link is
    softmax. But the full link function is the symmetric multinomial logit
    function.

    For a given sample x_i, the categorical cross-entropy loss is defined as
    the negative log-likelihood of the multinomial distribution, it
    generalizes the binary cross-entropy to more than 2 classes::

        loss_i = log(sum(exp(raw_pred_{i, k}), k=0..n_classes-1))
                - sum(y_true_{i, k} * raw_pred_{i, k}, k=0..n_classes-1)

    See [1].

    Note that for the hessian, we calculate only the diagonal part in the
    classes: If the full hessian for classes k and l and sample i is H_i_k_l,
    we calculate H_i_k_k, i.e. k=l.

    Reference
    ---------
    .. [1] :arxiv:`Simon, Noah, J. Friedman and T. Hastie.
        "A Blockwise Descent Algorithm for Group-penalized Multiresponse and
        Multinomial Regression".
        <1311.6529>`
    """

    is_multiclass = True

    def __init__(self, sample_weight=None, n_classes=3):
        cy_loss = CyHalfMultinomialLoss()
        super().__init__(closs=cy_loss, link=MultinomialLogit(), n_classes=n_classes)
        # Targets are non-negative (integrality is checked in
        # `in_y_true_range`); every predicted probability lies in (0, 1).
        self.interval_y_true = Interval(0, np.inf, True, False)
        self.interval_y_pred = Interval(0, 1, False, False)

    def in_y_true_range(self, y):
        """Return True if y is in the valid range of y_true.

        Besides lying in the valid interval, all values must be
        integer-valued.

        Parameters
        ----------
        y : ndarray
        """
        inside = self.interval_y_true.includes(y)
        if not inside:
            return inside
        return np.all(y.astype(int) == y)

    def fit_intercept_only(self, y_true, sample_weight=None):
        """Compute raw_prediction of an intercept-only model.

        This is the link function applied to the (weighted) frequency of each
        class in the target, i.e. over the samples axis=0. Frequencies are
        clipped away from 0 and 1 before the link is applied.
        """
        n_classes = self.n_classes
        freq = np.zeros(n_classes, dtype=y_true.dtype)
        eps = np.finfo(y_true.dtype).eps
        # Weighted share of samples that carry each class label.
        freq[:] = [
            np.average(y_true == label, weights=sample_weight, axis=0)
            for label in range(n_classes)
        ]
        np.clip(freq, eps, 1 - eps, out=freq)
        # The link is applied to a single-row 2d array, then flattened again.
        return self.link.link(freq[None, :]).reshape(-1)

    def predict_proba(self, raw_prediction):
        """Predict probabilities.

        Parameters
        ----------
        raw_prediction : array of shape (n_samples, n_classes)
            Raw prediction values (in link space).

        Returns
        -------
        proba : array of shape (n_samples, n_classes)
            Element-wise class probabilities.
        """
        inverse_link = self.link.inverse
        return inverse_link(raw_prediction)

    def gradient_proba(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        gradient_out=None,
        proba_out=None,
        n_threads=1,
    ):
        """Compute gradient and class probabilities for raw_prediction.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : array of shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or array of shape (n_samples, n_classes)
            A location into which the gradient is stored. If None, a new array
            might be created.
        proba_out : None or array of shape (n_samples, n_classes)
            A location into which the class probabilities are stored. If None,
            a new array might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        gradient : array of shape (n_samples, n_classes)
            Element-wise gradients.

        proba : array of shape (n_samples, n_classes)
            Element-wise class probabilities.
        """
        # Allocate whichever buffers are missing. When only one is supplied,
        # the other one mirrors it.
        if gradient_out is None and proba_out is None:
            gradient_out = np.empty_like(raw_prediction)
            proba_out = np.empty_like(raw_prediction)
        elif gradient_out is None:
            gradient_out = np.empty_like(proba_out)
        elif proba_out is None:
            proba_out = np.empty_like(gradient_out)

        self.closs.gradient_proba(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            gradient_out=gradient_out,
            proba_out=proba_out,
            n_threads=n_threads,
        )
        return gradient_out, proba_out
|
|
|
|
class ExponentialLoss(BaseLoss):
    """Exponential loss with (half) logit link, for binary classification.

    This is also known as boosting loss.

    Domain:
    y_true in [0, 1], i.e. regression on the unit interval
    y_pred in (0, 1), i.e. boundaries excluded

    Link:
    y_pred = expit(2 * raw_prediction)

    For a given sample x_i, the exponential loss is defined as::

        loss(x_i) = y_true_i * exp(-raw_pred_i) + (1 - y_true_i) * exp(raw_pred_i)

    See:
    - J. Friedman, T. Hastie, R. Tibshirani.
      "Additive logistic regression: a statistical view of boosting (With discussion
      and a rejoinder by the authors)." Ann. Statist. 28 (2) 337 - 407, April 2000.
      https://doi.org/10.1214/aos/1016218223
    - A. Buja, W. Stuetzle, Y. Shen. (2005).
      "Loss Functions for Binary Class Probability Estimation and Classification:
      Structure and Applications."

    Note that the formulation works for classification, y = {0, 1}, as well as
    "exponential logistic" regression, y = [0, 1].
    Note that this is a proper scoring rule, but without its canonical link.

    More details: Inserting the predicted probability
    y_pred = expit(2 * raw_prediction) in the loss gives::

        loss(x_i) = y_true_i * sqrt((1 - y_pred_i) / y_pred_i)
            + (1 - y_true_i) * sqrt(y_pred_i / (1 - y_pred_i))
    """

    def __init__(self, sample_weight=None):
        cy_loss = CyExponentialLoss()
        super().__init__(closs=cy_loss, link=HalfLogitLink(), n_classes=2)
        # Targets live on the closed unit interval [0, 1].
        self.interval_y_true = Interval(0, 1, True, True)

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        """Return `-2 * sqrt(y_true * (1 - y_true))`, weighted by
        `sample_weight` if given."""
        # This is non-zero only if y_true is neither 0 nor 1.
        constant = -2 * np.sqrt(y_true * (1 - y_true))
        if sample_weight is None:
            return constant
        constant *= sample_weight
        return constant

    def predict_proba(self, raw_prediction):
        """Predict probabilities.

        Parameters
        ----------
        raw_prediction : array of shape (n_samples,) or (n_samples, 1)
            Raw prediction values (in link space).

        Returns
        -------
        proba : array of shape (n_samples, 2)
            Element-wise class probabilities.
        """
        raw = raw_prediction
        # Be graceful to shape (n_samples, 1) -> (n_samples,).
        if raw.ndim == 2 and raw.shape[1] == 1:
            raw = raw.squeeze(1)
        n_samples = raw.shape[0]
        proba = np.empty((n_samples, 2), dtype=raw.dtype)
        # Column 1 holds the inverse-link output; column 0 is its complement.
        proba[:, 1] = self.link.inverse(raw)
        proba[:, 0] = 1 - proba[:, 1]
        return proba
|
|
|
|
# Registry mapping loss names (strings) to the loss classes defined in this
# module.
# NOTE(review): HalfTweedieLossIdentity is defined above but not registered
# here — confirm this omission is intentional.
_LOSSES = {
    "squared_error": HalfSquaredError,
    "absolute_error": AbsoluteError,
    "pinball_loss": PinballLoss,
    "huber_loss": HuberLoss,
    "poisson_loss": HalfPoissonLoss,
    "gamma_loss": HalfGammaLoss,
    "tweedie_loss": HalfTweedieLoss,
    "binomial_loss": HalfBinomialLoss,
    "multinomial_loss": HalfMultinomialLoss,
    "exponential_loss": ExponentialLoss,
}
|
|