| {{py: |
|
|
| """ |
| Template file to easily generate loops over samples using Tempita |
| (https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py). |
| |
| Generated file: _loss.pyx |
| |
| Each loss class is generated from cdef functions operating on single samples.
| The keywords between double braces are substituted during the build. |
| """ |
|
|
| doc_HalfSquaredError = ( |
| """Half Squared Error with identity link. |
| |
| Domain: |
| y_true and y_pred all real numbers |
| |
| Link: |
| y_pred = raw_prediction |
| """ |
| ) |
|
|
| doc_AbsoluteError = ( |
| """Absolute Error with identity link. |
| |
| Domain: |
| y_true and y_pred all real numbers |
| |
| Link: |
| y_pred = raw_prediction |
| """ |
| ) |
|
|
| doc_PinballLoss = ( |
| """Quantile Loss aka Pinball Loss with identity link. |
| |
| Domain: |
| y_true and y_pred all real numbers |
| quantile in (0, 1) |
| |
| Link: |
| y_pred = raw_prediction |
| |
| Note: 2 * CyPinballLoss(quantile=0.5) equals CyAbsoluteError()
| """ |
| ) |
|
|
| doc_HuberLoss = ( |
| """Huber Loss with identity link. |
| |
| Domain: |
| y_true and y_pred all real numbers |
| delta in positive real numbers |
| |
| Link: |
| y_pred = raw_prediction |
| """ |
| ) |
|
|
| doc_HalfPoissonLoss = ( |
| """Half Poisson deviance loss with log-link. |
| |
| Domain: |
| y_true in non-negative real numbers |
| y_pred in positive real numbers |
| |
| Link: |
| y_pred = exp(raw_prediction) |
| |
| Half Poisson deviance with log-link is |
| y_true * log(y_true/y_pred) + y_pred - y_true |
| = y_true * log(y_true) - y_true * raw_prediction |
| + exp(raw_prediction) - y_true |
| |
| Dropping constant terms, this gives: |
| exp(raw_prediction) - y_true * raw_prediction |
| """ |
| ) |
|
|
| doc_HalfGammaLoss = ( |
| """Half Gamma deviance loss with log-link. |
| |
| Domain: |
| y_true and y_pred in positive real numbers |
| |
| Link: |
| y_pred = exp(raw_prediction) |
| |
| Half Gamma deviance with log-link is |
| log(y_pred/y_true) + y_true/y_pred - 1 |
| = raw_prediction - log(y_true) + y_true * exp(-raw_prediction) - 1 |
| |
| Dropping constant terms, this gives: |
| raw_prediction + y_true * exp(-raw_prediction) |
| """ |
| ) |
|
|
| doc_HalfTweedieLoss = ( |
| """Half Tweedie deviance loss with log-link. |
| |
| Domain: |
| y_true in real numbers if p <= 0 |
| y_true in non-negative real numbers if 0 < p < 2 |
| y_true in positive real numbers if p >= 2 |
| y_pred in positive real numbers
| power in real numbers
| |
| Link: |
| y_pred = exp(raw_prediction) |
| |
| Half Tweedie deviance with log-link and p=power is |
| max(y_true, 0)**(2-p) / (1-p) / (2-p) |
| - y_true * y_pred**(1-p) / (1-p) |
| + y_pred**(2-p) / (2-p) |
| = max(y_true, 0)**(2-p) / (1-p) / (2-p) |
| - y_true * exp((1-p) * raw_prediction) / (1-p) |
| + exp((2-p) * raw_prediction) / (2-p) |
| |
| Dropping constant terms, this gives: |
| exp((2-p) * raw_prediction) / (2-p) |
| - y_true * exp((1-p) * raw_prediction) / (1-p) |
| |
| Notes: |
| - Poisson with p=1 and Gamma with p=2 have different terms dropped such
| that CyHalfTweedieLoss is not continuous in p=power at p=1 and p=2.
| - While the Tweedie distribution only exists for p<=0 or p>=1, the range |
| 0<p<1 still gives a strictly consistent scoring function for the |
| expectation. |
| """ |
| ) |
|
|
| doc_HalfTweedieLossIdentity = ( |
| """Half Tweedie deviance loss with identity link. |
| |
| Domain: |
| y_true in real numbers if p <= 0 |
| y_true in non-negative real numbers if 0 < p < 2 |
| y_true in positive real numbers if p >= 2 |
| y_pred in positive real numbers, y_pred may be negative for p=0.
| power in real numbers
| |
| Link: |
| y_pred = raw_prediction |
| |
| Half Tweedie deviance with identity link and p=power is |
| max(y_true, 0)**(2-p) / (1-p) / (2-p) |
| - y_true * y_pred**(1-p) / (1-p) |
| + y_pred**(2-p) / (2-p) |
| |
| Notes: |
| - In contrast to the version with log-link, we do not drop constant terms here.
| """ |
| ) |
|
|
| doc_HalfBinomialLoss = ( |
| """Half Binomial deviance loss with logit link. |
| |
| Domain: |
| y_true in [0, 1] |
| y_pred in (0, 1), i.e. boundaries excluded |
| |
| Link: |
| y_pred = expit(raw_prediction) |
| """ |
| ) |
|
|
| doc_ExponentialLoss = ( |
| """"Exponential loss with (half) logit link |
|
|
| Domain: |
| y_true in [0, 1] |
| y_pred in (0, 1), i.e. boundaries excluded |
|
|
| Link: |
| y_pred = expit(2 * raw_prediction) |
| """ |
| ) |
| |
| # loss class name, docstring, param,
| # closs, closs_grad,
| # cgrad, cgrad_hess,
| class_list = [ |
| ("CyHalfSquaredError", doc_HalfSquaredError, None, |
| "closs_half_squared_error", None, |
| "cgradient_half_squared_error", "cgrad_hess_half_squared_error"), |
| ("CyAbsoluteError", doc_AbsoluteError, None, |
| "closs_absolute_error", None, |
| "cgradient_absolute_error", "cgrad_hess_absolute_error"), |
| ("CyPinballLoss", doc_PinballLoss, "quantile", |
| "closs_pinball_loss", None, |
| "cgradient_pinball_loss", "cgrad_hess_pinball_loss"), |
| ("CyHuberLoss", doc_HuberLoss, "delta", |
| "closs_huber_loss", None, |
| "cgradient_huber_loss", "cgrad_hess_huber_loss"), |
| ("CyHalfPoissonLoss", doc_HalfPoissonLoss, None, |
| "closs_half_poisson", "closs_grad_half_poisson", |
| "cgradient_half_poisson", "cgrad_hess_half_poisson"), |
| ("CyHalfGammaLoss", doc_HalfGammaLoss, None, |
| "closs_half_gamma", "closs_grad_half_gamma", |
| "cgradient_half_gamma", "cgrad_hess_half_gamma"), |
| ("CyHalfTweedieLoss", doc_HalfTweedieLoss, "power", |
| "closs_half_tweedie", "closs_grad_half_tweedie", |
| "cgradient_half_tweedie", "cgrad_hess_half_tweedie"), |
| ("CyHalfTweedieLossIdentity", doc_HalfTweedieLossIdentity, "power", |
| "closs_half_tweedie_identity", "closs_grad_half_tweedie_identity", |
| "cgradient_half_tweedie_identity", "cgrad_hess_half_tweedie_identity"), |
| ("CyHalfBinomialLoss", doc_HalfBinomialLoss, None, |
| "closs_half_binomial", "closs_grad_half_binomial", |
| "cgradient_half_binomial", "cgrad_hess_half_binomial"), |
| ("CyExponentialLoss", doc_ExponentialLoss, None, |
| "closs_exponential", "closs_grad_exponential", |
| "cgradient_exponential", "cgrad_hess_exponential"), |
| ] |
| }} |
| |
| # Design: |
| # See https://github.com/scikit-learn/scikit-learn/issues/15123 for reasons. |
| # a) Merge link functions into loss functions for speed and numerical |
| # stability, i.e. use raw_prediction instead of y_pred in signature. |
| # b) Pure C functions (nogil) calculate single points (single sample) |
| # c) Wrap C functions in a loop to get Python functions operating on ndarrays. |
| # - Write loops manually---use Tempita for this. |
| # Reason: There is still some performance overhead when using a wrapper |
| # function "wrap" that carries out the loop and gets as argument a function |
| # pointer to one of the C functions from b), e.g. |
| # wrap(closs_half_poisson, y_true, ...) |
| # - Pass n_threads as argument to prange and propagate option to all callers. |
| # d) Provide classes (Cython extension types) per loss (names start with Cy) in |
| # order to have semantically structured objects.
| # - Member functions for single points just call the C function from b). |
| # These are used e.g. in SGD `_plain_sgd`. |
| # - Member functions operating on ndarrays, see c), looping over calls to C |
| # functions from b). |
| # e) Provide convenience Python classes that compose from these extension types |
| # elsewhere (see loss.py) |
| # - Example: loss.gradient calls CyLoss.gradient but does some input |
| # checking like None -> np.empty(). |
| # |
| # Note: We require 1-dim ndarrays to be contiguous. |
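|
| # Illustration of a), not part of the build: merging the log-link into the loss
| # lets us evaluate directly in raw_prediction space. A vectorized NumPy sketch
| # (hypothetical, exposition only) of closs_half_poisson from below:
| #
| #     import numpy as np
| #     raw = np.array([-2.0, 0.0, 3.0])
| #     y_true = np.array([0.0, 1.0, 5.0])
| #     loss = np.exp(raw) - y_true * raw  # constant terms dropped, see docstring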
| |
| from cython.parallel import parallel, prange |
| import numpy as np |
| |
| from libc.math cimport exp, fabs, log, log1p, pow |
| from libc.stdlib cimport malloc, free |
| |
| |
| # ------------------------------------- |
| # Helper functions |
| # ------------------------------------- |
| # Numerically stable version of log(1 + exp(x)) for double precision, see Eq. (10) of |
| # https://cran.r-project.org/web/packages/Rmpfr/vignettes/log1mexp-note.pdf |
| # Note: The only important cutoff is at x = 18. All others are to save computation |
| # time. Compared to the reference, we add the additional case distinction x <= -2 in |
| # order to use log instead of log1p for improved performance. As with the other |
| # cutoffs, this is accurate within machine precision of double. |
| cdef inline double log1pexp(double x) noexcept nogil: |
| if x <= -37: |
| return exp(x) |
| elif x <= -2: |
| return log1p(exp(x)) |
| elif x <= 18: |
| return log(1. + exp(x)) |
| elif x <= 33.3: |
| return x + exp(-x) |
| else: |
| return x |
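|
|
| # Quick demonstration of the overflow that log1pexp avoids (plain Python,
| # illustration only, not part of the build):
| #
| #     import math
| #     math.log1p(math.exp(800.0))  # raises OverflowError, exp(800) > double max
| #     # log1pexp(800.0) takes the last branch and returns 800.0 exactly.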
| |
| |
| cdef inline double_pair sum_exp_minus_max( |
| const int i, |
| const floating_in[:, :] raw_prediction, # IN |
| floating_out *p # OUT |
| ) noexcept nogil: |
| # Thread local buffers are used to store part of the results via p. |
| # The results are stored as follows: |
| # p[k] = exp(raw_prediction_i_k - max_value) for k = 0 to n_classes-1 |
| # return.val1 = max_value = max(raw_prediction_i_k, k = 0 to n_classes-1) |
| # return.val2 = sum_exps = sum(p[k], k = 0 to n_classes-1) = sum of exponentials |
| # len(p) must be n_classes |
| # Notes: |
| # - We return the max value and sum of exps (stored in p) as a double_pair. |
| # - i needs to be passed (and stays constant) because otherwise Cython does |
| # not generate optimal code, see |
| # https://github.com/scikit-learn/scikit-learn/issues/17299 |
| # - We do not normalize p by calculating p[k] = p[k] / sum_exps. |
| # This helps to save one loop over k. |
| cdef: |
| int k |
| int n_classes = raw_prediction.shape[1] |
| double_pair max_value_and_sum_exps # val1 = max_value, val2 = sum_exps |
| |
| max_value_and_sum_exps.val1 = raw_prediction[i, 0] |
| max_value_and_sum_exps.val2 = 0 |
| for k in range(1, n_classes): |
| # Compute max value of array for numerical stability |
| if max_value_and_sum_exps.val1 < raw_prediction[i, k]: |
| max_value_and_sum_exps.val1 = raw_prediction[i, k] |
| |
| for k in range(n_classes): |
| p[k] = exp(raw_prediction[i, k] - max_value_and_sum_exps.val1) |
| max_value_and_sum_exps.val2 += p[k] |
| |
| return max_value_and_sum_exps |
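|
|
| # For reference, a NumPy sketch of what sum_exp_minus_max computes for row i
| # (illustration only; the Cython version writes p through a raw pointer):
| #
| #     m = raw_prediction[i].max()        # returned as val1
| #     p = np.exp(raw_prediction[i] - m)  # written into the buffer p
| #     s = p.sum()                        # returned as val2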
| |
| |
| # ------------------------------------- |
| # Single point inline C functions |
| # ------------------------------------- |
| # Half Squared Error |
| cdef inline double closs_half_squared_error( |
| double y_true, |
| double raw_prediction |
| ) noexcept nogil: |
| return 0.5 * (raw_prediction - y_true) * (raw_prediction - y_true) |
| |
| |
| cdef inline double cgradient_half_squared_error( |
| double y_true, |
| double raw_prediction |
| ) noexcept nogil: |
| return raw_prediction - y_true |
| |
| |
| cdef inline double_pair cgrad_hess_half_squared_error( |
| double y_true, |
| double raw_prediction |
| ) noexcept nogil: |
| cdef double_pair gh |
| gh.val1 = raw_prediction - y_true # gradient |
| gh.val2 = 1. # hessian |
| return gh |
| |
| |
| # Absolute Error |
| cdef inline double closs_absolute_error( |
| double y_true, |
| double raw_prediction |
| ) noexcept nogil: |
| return fabs(raw_prediction - y_true) |
| |
| |
| cdef inline double cgradient_absolute_error( |
| double y_true, |
| double raw_prediction |
| ) noexcept nogil: |
| return 1. if raw_prediction > y_true else -1. |
| |
| |
| cdef inline double_pair cgrad_hess_absolute_error( |
| double y_true, |
| double raw_prediction |
| ) noexcept nogil: |
| cdef double_pair gh |
| # Note that exact hessian = 0 almost everywhere. Optimization routines like |
| # in HGBT, however, need a hessian > 0. Therefore, we assign 1. |
| gh.val1 = 1. if raw_prediction > y_true else -1. # gradient |
| gh.val2 = 1. # hessian |
| return gh |
| |
| |
| # Quantile Loss / Pinball Loss |
| cdef inline double closs_pinball_loss( |
| double y_true, |
| double raw_prediction, |
| double quantile |
| ) noexcept nogil: |
| return (quantile * (y_true - raw_prediction) if y_true >= raw_prediction |
| else (1. - quantile) * (raw_prediction - y_true)) |
| |
| |
| cdef inline double cgradient_pinball_loss( |
| double y_true, |
| double raw_prediction, |
| double quantile |
| ) noexcept nogil: |
| return -quantile if y_true >= raw_prediction else 1. - quantile
| |
| |
| cdef inline double_pair cgrad_hess_pinball_loss( |
| double y_true, |
| double raw_prediction, |
| double quantile |
| ) noexcept nogil: |
| cdef double_pair gh |
| # Note that exact hessian = 0 almost everywhere. Optimization routines like |
| # in HGBT, however, need a hessian > 0. Therefore, we assign 1. |
| gh.val1 = -quantile if y_true >= raw_prediction else 1. - quantile # gradient
| gh.val2 = 1. # hessian |
| return gh |
| |
| |
| # Huber Loss |
| cdef inline double closs_huber_loss( |
| double y_true, |
| double raw_prediction, |
| double delta, |
| ) noexcept nogil: |
| cdef double abserr = fabs(y_true - raw_prediction) |
| if abserr <= delta: |
| return 0.5 * abserr**2 |
| else: |
| return delta * (abserr - 0.5 * delta) |
| |
| |
| cdef inline double cgradient_huber_loss( |
| double y_true, |
| double raw_prediction, |
| double delta, |
| ) noexcept nogil: |
| cdef double res = raw_prediction - y_true |
| if fabs(res) <= delta: |
| return res |
| else: |
| return delta if res >= 0 else -delta
| |
| |
| cdef inline double_pair cgrad_hess_huber_loss( |
| double y_true, |
| double raw_prediction, |
| double delta, |
| ) noexcept nogil: |
| cdef double_pair gh |
| gh.val2 = raw_prediction - y_true # used as temporary |
| if fabs(gh.val2) <= delta: |
| gh.val1 = gh.val2 # gradient |
| gh.val2 = 1 # hessian |
| else: |
| gh.val1 = delta if gh.val2 >= 0 else -delta # gradient
| gh.val2 = 0 # hessian |
| return gh |
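|
|
| # The closed-form gradients above can be sanity-checked with central finite
| # differences (conceptual sketch; the cdef functions are not callable from
| # Python, so this assumes equivalent Python implementations of them):
| #
| #     eps = 1e-7
| #     num_grad = (loss(y, raw + eps) - loss(y, raw - eps)) / (2 * eps)
| #     # num_grad ~= gradient(y, raw), except at kinks such as abserr == delta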
| |
| |
| # Half Poisson Deviance with Log-Link, dropping constant terms |
| cdef inline double closs_half_poisson( |
| double y_true, |
| double raw_prediction |
| ) noexcept nogil: |
| return exp(raw_prediction) - y_true * raw_prediction |
| |
| |
| cdef inline double cgradient_half_poisson( |
| double y_true, |
| double raw_prediction |
| ) noexcept nogil: |
| # y_pred - y_true |
| return exp(raw_prediction) - y_true |
| |
| |
| cdef inline double_pair closs_grad_half_poisson( |
| double y_true, |
| double raw_prediction |
| ) noexcept nogil: |
| cdef double_pair lg |
| lg.val2 = exp(raw_prediction) # used as temporary |
| lg.val1 = lg.val2 - y_true * raw_prediction # loss |
| lg.val2 -= y_true # gradient |
| return lg |
| |
| |
| cdef inline double_pair cgrad_hess_half_poisson( |
| double y_true, |
| double raw_prediction |
| ) noexcept nogil: |
| cdef double_pair gh |
| gh.val2 = exp(raw_prediction) # hessian |
| gh.val1 = gh.val2 - y_true # gradient |
| return gh |
| |
| |
| # Half Gamma Deviance with Log-Link, dropping constant terms |
| cdef inline double closs_half_gamma( |
| double y_true, |
| double raw_prediction |
| ) noexcept nogil: |
| return raw_prediction + y_true * exp(-raw_prediction) |
| |
| |
| cdef inline double cgradient_half_gamma( |
| double y_true, |
| double raw_prediction |
| ) noexcept nogil: |
| return 1. - y_true * exp(-raw_prediction) |
| |
| |
| cdef inline double_pair closs_grad_half_gamma( |
| double y_true, |
| double raw_prediction |
| ) noexcept nogil: |
| cdef double_pair lg |
| lg.val2 = exp(-raw_prediction) # used as temporary |
| lg.val1 = raw_prediction + y_true * lg.val2 # loss |
| lg.val2 = 1. - y_true * lg.val2 # gradient |
| return lg |
| |
| |
| cdef inline double_pair cgrad_hess_half_gamma( |
| double y_true, |
| double raw_prediction |
| ) noexcept nogil: |
| cdef double_pair gh |
| gh.val2 = exp(-raw_prediction) # used as temporary |
| gh.val1 = 1. - y_true * gh.val2 # gradient |
| gh.val2 *= y_true # hessian |
| return gh |
| |
| |
| # Half Tweedie Deviance with Log-Link, dropping constant terms |
| # Note that by dropping constants this is no longer continuous in parameter power. |
| cdef inline double closs_half_tweedie( |
| double y_true, |
| double raw_prediction, |
| double power |
| ) noexcept nogil: |
| if power == 0.: |
| return closs_half_squared_error(y_true, exp(raw_prediction)) |
| elif power == 1.: |
| return closs_half_poisson(y_true, raw_prediction) |
| elif power == 2.: |
| return closs_half_gamma(y_true, raw_prediction) |
| else: |
| return (exp((2. - power) * raw_prediction) / (2. - power) |
| - y_true * exp((1. - power) * raw_prediction) / (1. - power)) |
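|
|
| # Numerical illustration of the discontinuity in power noted above: at
| # y_true=2, raw_prediction=0 the p == 1 branch gives exp(0) - 2*0 = 1, while
| # the generic branch, 1/(2-p) - 2/(1-p), diverges as p -> 1 because its
| # dropped constant terms differ from those of closs_half_poisson.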
| |
| |
| cdef inline double cgradient_half_tweedie( |
| double y_true, |
| double raw_prediction, |
| double power |
| ) noexcept nogil: |
| cdef double exp1 |
| if power == 0.: |
| exp1 = exp(raw_prediction) |
| return exp1 * (exp1 - y_true) |
| elif power == 1.: |
| return cgradient_half_poisson(y_true, raw_prediction) |
| elif power == 2.: |
| return cgradient_half_gamma(y_true, raw_prediction) |
| else: |
| return (exp((2. - power) * raw_prediction) |
| - y_true * exp((1. - power) * raw_prediction)) |
| |
| |
| cdef inline double_pair closs_grad_half_tweedie( |
| double y_true, |
| double raw_prediction, |
| double power |
| ) noexcept nogil: |
| cdef double_pair lg |
| cdef double exp1, exp2 |
| if power == 0.: |
| exp1 = exp(raw_prediction) |
| lg.val1 = closs_half_squared_error(y_true, exp1) # loss |
| lg.val2 = exp1 * (exp1 - y_true) # gradient |
| elif power == 1.: |
| return closs_grad_half_poisson(y_true, raw_prediction) |
| elif power == 2.: |
| return closs_grad_half_gamma(y_true, raw_prediction) |
| else: |
| exp1 = exp((1. - power) * raw_prediction) |
| exp2 = exp((2. - power) * raw_prediction) |
| lg.val1 = exp2 / (2. - power) - y_true * exp1 / (1. - power) # loss |
| lg.val2 = exp2 - y_true * exp1 # gradient |
| return lg |
| |
| |
| cdef inline double_pair cgrad_hess_half_tweedie( |
| double y_true, |
| double raw_prediction, |
| double power |
| ) noexcept nogil: |
| cdef double_pair gh |
| cdef double exp1, exp2 |
| if power == 0.: |
| exp1 = exp(raw_prediction) |
| gh.val1 = exp1 * (exp1 - y_true) # gradient |
| gh.val2 = exp1 * (2 * exp1 - y_true) # hessian |
| elif power == 1.: |
| return cgrad_hess_half_poisson(y_true, raw_prediction) |
| elif power == 2.: |
| return cgrad_hess_half_gamma(y_true, raw_prediction) |
| else: |
| exp1 = exp((1. - power) * raw_prediction) |
| exp2 = exp((2. - power) * raw_prediction) |
| gh.val1 = exp2 - y_true * exp1 # gradient |
| gh.val2 = (2. - power) * exp2 - (1. - power) * y_true * exp1 # hessian |
| return gh |
| |
| |
| # Half Tweedie Deviance with identity link, without dropping constant terms! |
| # Therefore, best loss value is zero. |
| cdef inline double closs_half_tweedie_identity( |
| double y_true, |
| double raw_prediction, |
| double power |
| ) noexcept nogil: |
| cdef double tmp |
| if power == 0.: |
| return closs_half_squared_error(y_true, raw_prediction) |
| elif power == 1.: |
| if y_true == 0: |
| return raw_prediction |
| else: |
| return y_true * log(y_true/raw_prediction) + raw_prediction - y_true |
| elif power == 2.: |
| return log(raw_prediction/y_true) + y_true/raw_prediction - 1. |
| else: |
| tmp = pow(raw_prediction, 1. - power) |
| tmp = raw_prediction * tmp / (2. - power) - y_true * tmp / (1. - power) |
| if y_true > 0: |
| tmp += pow(y_true, 2. - power) / ((1. - power) * (2. - power)) |
| return tmp |
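|
|
| # Sanity check that the minimum of the identity-link version is zero (see the
| # note above): plugging raw_prediction = y_true into the generic branch gives
| #     y**(2-p) * (1/(2-p) - 1/(1-p) + 1/((1-p)*(2-p))) = y**(2-p) * 0 = 0
| # and the special cases p in {0, 1, 2} likewise vanish at y_pred = y_true.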
| |
| |
| cdef inline double cgradient_half_tweedie_identity( |
| double y_true, |
| double raw_prediction, |
| double power |
| ) noexcept nogil: |
| if power == 0.: |
| return raw_prediction - y_true |
| elif power == 1.: |
| return 1. - y_true / raw_prediction |
| elif power == 2.: |
| return (raw_prediction - y_true) / (raw_prediction * raw_prediction) |
| else: |
| return pow(raw_prediction, -power) * (raw_prediction - y_true) |
| |
| |
| cdef inline double_pair closs_grad_half_tweedie_identity( |
| double y_true, |
| double raw_prediction, |
| double power |
| ) noexcept nogil: |
| cdef double_pair lg |
| cdef double tmp |
| if power == 0.: |
| lg.val2 = raw_prediction - y_true # gradient |
| lg.val1 = 0.5 * lg.val2 * lg.val2 # loss |
| elif power == 1.: |
| if y_true == 0: |
| lg.val1 = raw_prediction |
| else: |
| lg.val1 = (y_true * log(y_true/raw_prediction) # loss |
| + raw_prediction - y_true) |
| lg.val2 = 1. - y_true / raw_prediction # gradient |
| elif power == 2.: |
| lg.val1 = log(raw_prediction/y_true) + y_true/raw_prediction - 1. # loss |
| tmp = raw_prediction * raw_prediction |
| lg.val2 = (raw_prediction - y_true) / tmp # gradient |
| else: |
| tmp = pow(raw_prediction, 1. - power) |
| lg.val1 = (raw_prediction * tmp / (2. - power) # loss |
| - y_true * tmp / (1. - power)) |
| if y_true > 0: |
| lg.val1 += (pow(y_true, 2. - power) |
| / ((1. - power) * (2. - power))) |
| lg.val2 = tmp * (1. - y_true / raw_prediction) # gradient |
| return lg |
| |
| |
| cdef inline double_pair cgrad_hess_half_tweedie_identity( |
| double y_true, |
| double raw_prediction, |
| double power |
| ) noexcept nogil: |
| cdef double_pair gh |
| cdef double tmp |
| if power == 0.: |
| gh.val1 = raw_prediction - y_true # gradient |
| gh.val2 = 1. # hessian |
| elif power == 1.: |
| gh.val1 = 1. - y_true / raw_prediction # gradient |
| gh.val2 = y_true / (raw_prediction * raw_prediction) # hessian |
| elif power == 2.: |
| tmp = raw_prediction * raw_prediction |
| gh.val1 = (raw_prediction - y_true) / tmp # gradient |
| gh.val2 = (-1. + 2. * y_true / raw_prediction) / tmp # hessian |
| else: |
| tmp = pow(raw_prediction, -power) |
| gh.val1 = tmp * (raw_prediction - y_true) # gradient |
| gh.val2 = tmp * ((1. - power) + power * y_true / raw_prediction) # hessian |
| return gh |
| |
| |
| # Half Binomial deviance with logit-link, aka log-loss or binary cross entropy |
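| # Derivation of the closed form used below: with y_pred = expit(raw_prediction)
| # and the identity log1pexp(-x) = log1pexp(x) - x,
| #     -y_true * log(y_pred) - (1 - y_true) * log(1 - y_pred)
| #     = y_true * log1pexp(-raw_prediction)
| #       + (1 - y_true) * (raw_prediction + log1pexp(-raw_prediction))
| #     = log1pexp(raw_prediction) - y_true * raw_prediction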
| cdef inline double closs_half_binomial( |
| double y_true, |
| double raw_prediction |
| ) noexcept nogil: |
| # log1p(exp(raw_prediction)) - y_true * raw_prediction |
| return log1pexp(raw_prediction) - y_true * raw_prediction |
| |
| |
| cdef inline double cgradient_half_binomial( |
| double y_true, |
| double raw_prediction |
| ) noexcept nogil: |
| # gradient = y_pred - y_true = expit(raw_prediction) - y_true |
| # Numerically more stable, see http://fa.bianp.net/blog/2019/evaluate_logistic/ |
| # if raw_prediction < 0: |
| # exp_tmp = exp(raw_prediction) |
| # return ((1 - y_true) * exp_tmp - y_true) / (1 + exp_tmp) |
| # else: |
| # exp_tmp = exp(-raw_prediction) |
| # return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp) |
| # Note that optimal speed would be achieved, at the cost of precision, by |
| # return expit(raw_prediction) - y_true |
| # i.e. no "if else" and an own inline implementation of expit instead of |
| # from scipy.special.cython_special cimport expit |
| # The case distinction raw_prediction < 0 in the stable implementation does not |
| # provide significant better precision apart from protecting overflow of exp(..). |
| # The branch (if else), however, can incur runtime costs of up to 30%. |
| # Instead, we help branch prediction by almost always ending in the first if clause |
| # and making the second branch (else) a bit simpler. This has the exact same |
| # precision but is faster than the stable implementation. |
| # As branching criteria, we use the same cutoff as in log1pexp. Note that the |
| # maximal value to get gradient = -1 with y_true = 1 is -37.439198610162731 |
| # (based on mpmath), and scipy.special.logit(np.finfo(float).eps) ~ -36.04365. |
| cdef double exp_tmp |
| if raw_prediction > -37: |
| exp_tmp = exp(-raw_prediction) |
| return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp) |
| else: |
| # expit(raw_prediction) = exp(raw_prediction) for raw_prediction <= -37 |
| return exp(raw_prediction) - y_true |
| |
| |
| cdef inline double_pair closs_grad_half_binomial( |
| double y_true, |
| double raw_prediction |
| ) noexcept nogil: |
| cdef double_pair lg |
| # Same if else conditions as in log1pexp. |
| if raw_prediction <= -37: |
| lg.val2 = exp(raw_prediction) # used as temporary |
| lg.val1 = lg.val2 - y_true * raw_prediction # loss |
| lg.val2 -= y_true # gradient |
| elif raw_prediction <= -2: |
| lg.val2 = exp(raw_prediction) # used as temporary |
| lg.val1 = log1p(lg.val2) - y_true * raw_prediction # loss |
| lg.val2 = ((1 - y_true) * lg.val2 - y_true) / (1 + lg.val2) # gradient |
| elif raw_prediction <= 18: |
| lg.val2 = exp(-raw_prediction) # used as temporary |
| # log1p(exp(x)) = log(1 + exp(x)) = x + log1p(exp(-x)) |
| lg.val1 = log1p(lg.val2) + (1 - y_true) * raw_prediction # loss |
| lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2) # gradient |
| else: |
| lg.val2 = exp(-raw_prediction) # used as temporary |
| lg.val1 = lg.val2 + (1 - y_true) * raw_prediction # loss |
| lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2) # gradient |
| return lg |
| |
| |
| cdef inline double_pair cgrad_hess_half_binomial( |
| double y_true, |
| double raw_prediction |
| ) noexcept nogil: |
| # with y_pred = expit(raw) |
| # hessian = y_pred * (1 - y_pred) = exp( raw) / (1 + exp( raw))**2 |
| # = exp(-raw) / (1 + exp(-raw))**2 |
| cdef double_pair gh |
| # See comment in cgradient_half_binomial. |
| if raw_prediction > -37: |
| gh.val2 = exp(-raw_prediction) # used as temporary |
| gh.val1 = ((1 - y_true) - y_true * gh.val2) / (1 + gh.val2) # gradient |
| gh.val2 = gh.val2 / (1 + gh.val2)**2 # hessian |
| else: |
| gh.val2 = exp(raw_prediction) # hessian; 1st-order Taylor of p * (1 - p)
| gh.val1 = gh.val2 - y_true # gradient
| return gh |
| |
| |
| # Exponential loss with (half) logit-link, aka boosting loss |
| cdef inline double closs_exponential( |
| double y_true, |
| double raw_prediction |
| ) noexcept nogil: |
| cdef double tmp = exp(raw_prediction) |
| return y_true / tmp + (1 - y_true) * tmp |
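|
|
| # Equivalently, with labels mapped to s = 2 * y_true - 1 in {-1, +1}, the loss
| # above reads exp(-s * raw_prediction), the classical AdaBoost loss:
| # y_true = 1 gives exp(-raw_prediction) and y_true = 0 gives exp(raw_prediction).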
| |
| |
| cdef inline double cgradient_exponential( |
| double y_true, |
| double raw_prediction |
| ) noexcept nogil: |
| cdef double tmp = exp(raw_prediction) |
| return -y_true / tmp + (1 - y_true) * tmp |
| |
| |
| cdef inline double_pair closs_grad_exponential( |
| double y_true, |
| double raw_prediction |
| ) noexcept nogil: |
| cdef double_pair lg |
| lg.val2 = exp(raw_prediction) # used as temporary |
| |
| lg.val1 = y_true / lg.val2 + (1 - y_true) * lg.val2 # loss |
| lg.val2 = -y_true / lg.val2 + (1 - y_true) * lg.val2 # gradient |
| return lg |
| |
| |
| cdef inline double_pair cgrad_hess_exponential( |
| double y_true, |
| double raw_prediction |
| ) noexcept nogil: |
| # Note that hessian = loss |
| cdef double_pair gh |
| gh.val2 = exp(raw_prediction) # used as temporary |
| |
| gh.val1 = -y_true / gh.val2 + (1 - y_true) * gh.val2 # gradient |
| gh.val2 = y_true / gh.val2 + (1 - y_true) * gh.val2 # hessian |
| return gh |
| |
| |
| # --------------------------------------------------- |
| # Extension Types for Loss Functions of 1-dim targets |
| # --------------------------------------------------- |
| cdef class CyLossFunction: |
| """Base class for convex loss functions.""" |
| |
| def __reduce__(self): |
| return (self.__class__, ()) |
| |
| cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil: |
| """Compute the loss for a single sample. |
|
| Parameters |
| ---------- |
| y_true : double |
| Observed, true target value. |
| raw_prediction : double |
| Raw prediction value (in link space). |
|
| Returns |
| ------- |
| double |
| The loss evaluated at `y_true` and `raw_prediction`. |
| """ |
| pass |
| |
| cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil: |
| """Compute gradient of loss w.r.t. raw_prediction for a single sample. |
|
| Parameters |
| ---------- |
| y_true : double |
| Observed, true target value. |
| raw_prediction : double |
| Raw prediction value (in link space). |
|
| Returns |
| ------- |
| double |
| The derivative of the loss function w.r.t. `raw_prediction`. |
| """ |
| pass |
| |
| cdef double_pair cy_grad_hess( |
| self, double y_true, double raw_prediction |
| ) noexcept nogil: |
| """Compute gradient and hessian. |
|
| Gradient and hessian of loss w.r.t. raw_prediction for a single sample. |
|
| This is usually diagonal in raw_prediction_i and raw_prediction_j. |
| Therefore, we return the diagonal element i=j. |
|
| For a loss with a non-canonical link, this might implement the diagonal |
| of the Fisher matrix (=expected hessian) instead of the hessian. |
|
| Parameters |
| ---------- |
| y_true : double |
| Observed, true target value. |
| raw_prediction : double |
| Raw prediction value (in link space). |
|
| Returns |
| ------- |
| double_pair |
| Gradient and hessian of the loss function w.r.t. `raw_prediction`. |
| """ |
| pass |
| |
| def loss( |
| self, |
| const floating_in[::1] y_true, # IN |
| const floating_in[::1] raw_prediction, # IN |
| const floating_in[::1] sample_weight, # IN |
| floating_out[::1] loss_out, # OUT |
| int n_threads=1 |
| ): |
| """Compute the point-wise loss value for each input. |
|
| The point-wise loss is written to `loss_out` and no array is returned. |
|
| Parameters |
| ---------- |
| y_true : array of shape (n_samples,) |
| Observed, true target values. |
| raw_prediction : array of shape (n_samples,) |
| Raw prediction values (in link space). |
| sample_weight : array of shape (n_samples,) or None |
| Sample weights. |
| loss_out : array of shape (n_samples,) |
| A location into which the result is stored. |
| n_threads : int |
| Number of threads used by OpenMP (if any). |
| """ |
| pass |
| |
| def gradient( |
| self, |
| const floating_in[::1] y_true, # IN |
| const floating_in[::1] raw_prediction, # IN |
| const floating_in[::1] sample_weight, # IN |
| floating_out[::1] gradient_out, # OUT |
| int n_threads=1 |
| ): |
| """Compute gradient of loss w.r.t raw_prediction for each input. |
|
| The gradient is written to `gradient_out` and no array is returned. |
|
| Parameters |
| ---------- |
| y_true : array of shape (n_samples,) |
| Observed, true target values. |
| raw_prediction : array of shape (n_samples,) |
| Raw prediction values (in link space). |
| sample_weight : array of shape (n_samples,) or None |
| Sample weights. |
| gradient_out : array of shape (n_samples,) |
| A location into which the result is stored. |
| n_threads : int |
| Number of threads used by OpenMP (if any). |
| """ |
| pass |
| |
| def loss_gradient( |
| self, |
| const floating_in[::1] y_true, # IN |
| const floating_in[::1] raw_prediction, # IN |
| const floating_in[::1] sample_weight, # IN |
| floating_out[::1] loss_out, # OUT |
| floating_out[::1] gradient_out, # OUT |
| int n_threads=1 |
| ): |
| """Compute loss and gradient of loss w.r.t raw_prediction. |
|
| The loss and gradient are written to `loss_out` and `gradient_out` and no arrays |
| are returned. |
|
| Parameters |
| ---------- |
| y_true : array of shape (n_samples,) |
| Observed, true target values. |
| raw_prediction : array of shape (n_samples,) |
| Raw prediction values (in link space). |
| sample_weight : array of shape (n_samples,) or None |
| Sample weights. |
| loss_out : array of shape (n_samples,) or None |
| A location into which the element-wise loss is stored. |
| gradient_out : array of shape (n_samples,) |
| A location into which the gradient is stored. |
| n_threads : int |
| Number of threads used by OpenMP (if any). |
| """ |
| self.loss(y_true, raw_prediction, sample_weight, loss_out, n_threads) |
| self.gradient(y_true, raw_prediction, sample_weight, gradient_out, n_threads) |
| |
| def gradient_hessian( |
| self, |
| const floating_in[::1] y_true, # IN |
| const floating_in[::1] raw_prediction, # IN |
| const floating_in[::1] sample_weight, # IN |
| floating_out[::1] gradient_out, # OUT |
| floating_out[::1] hessian_out, # OUT |
| int n_threads=1 |
| ): |
| """Compute gradient and hessian of loss w.r.t raw_prediction. |
|
| The gradient and hessian are written to `gradient_out` and `hessian_out` and no |
| arrays are returned. |
|
| Parameters |
| ---------- |
| y_true : array of shape (n_samples,) |
| Observed, true target values. |
| raw_prediction : array of shape (n_samples,) |
| Raw prediction values (in link space). |
| sample_weight : array of shape (n_samples,) or None |
| Sample weights. |
| gradient_out : array of shape (n_samples,) |
| A location into which the gradient is stored. |
| hessian_out : array of shape (n_samples,) |
| A location into which the hessian is stored. |
| n_threads : int |
| Number of threads used by OpenMP (if any). |
| """ |
| pass |
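|
|
| # Sketch of e) from the design notes above (hypothetical and simplified, the
| # actual composition lives in loss.py): a Python-level loss holds one of the
| # Cy* extension types plus a link object and adds input validation, e.g.
| #
| #     class HalfSquaredError(BaseLoss):
| #         def __init__(self, sample_weight=None):
| #             super().__init__(closs=CyHalfSquaredError(), link=IdentityLink())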
| |
| |
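| # The Tempita for-loop below stamps out one extension type per class_list
| # entry. For example (hypothetical rendering), the CyPinballLoss entry has
| # param = "quantile", so with_param expands to ", self.quantile" and the
| # generated single-sample call reads
| #     closs_pinball_loss(y_true, raw_prediction, self.quantile)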
| {{for name, docstring, param, closs, closs_grad, cgrad, cgrad_hess in class_list}}
| {{py: |
| if param is None: |
| with_param = "" |
| else: |
| with_param = ", self." + param |
| }} |
| |
| cdef class {{name}}(CyLossFunction): |
| """{{docstring}}""" |
| |
| {{if param is not None}} |
| def __init__(self, {{param}}): |
| self.{{param}} = {{param}} |
| {{endif}} |
| |
| {{if param is not None}} |
| def __reduce__(self): |
| return (self.__class__, (self.{{param}},)) |
| {{endif}} |
| |
| cdef inline double cy_loss(self, double y_true, double raw_prediction) noexcept nogil: |
| return {{closs}}(y_true, raw_prediction{{with_param}}) |
| |
| cdef inline double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil: |
| return {{cgrad}}(y_true, raw_prediction{{with_param}}) |
| |
| cdef inline double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil: |
| return {{cgrad_hess}}(y_true, raw_prediction{{with_param}}) |
| |
| def loss( |
| self, |
| const floating_in[::1] y_true, # IN |
| const floating_in[::1] raw_prediction, # IN |
| const floating_in[::1] sample_weight, # IN |
| floating_out[::1] loss_out, # OUT |
| int n_threads=1 |
| ): |
| cdef: |
| int i |
| int n_samples = y_true.shape[0] |
| |
| if sample_weight is None: |
| for i in prange( |
| n_samples, schedule='static', nogil=True, num_threads=n_threads |
| ): |
| loss_out[i] = {{closs}}(y_true[i], raw_prediction[i]{{with_param}}) |
| else: |
| for i in prange( |
| n_samples, schedule='static', nogil=True, num_threads=n_threads |
| ): |
| loss_out[i] = sample_weight[i] * {{closs}}(y_true[i], raw_prediction[i]{{with_param}}) |
| |
| {{if closs_grad is not None}} |
| def loss_gradient( |
| self, |
| const floating_in[::1] y_true, # IN |
| const floating_in[::1] raw_prediction, # IN |
| const floating_in[::1] sample_weight, # IN |
| floating_out[::1] loss_out, # OUT |
| floating_out[::1] gradient_out, # OUT |
| int n_threads=1 |
| ): |
| cdef: |
| int i |
| int n_samples = y_true.shape[0] |
| double_pair dbl2 |
| |
| if sample_weight is None: |
| for i in prange( |
| n_samples, schedule='static', nogil=True, num_threads=n_threads |
| ): |
| dbl2 = {{closs_grad}}(y_true[i], raw_prediction[i]{{with_param}}) |
| loss_out[i] = dbl2.val1 |
| gradient_out[i] = dbl2.val2 |
| else: |
| for i in prange( |
| n_samples, schedule='static', nogil=True, num_threads=n_threads |
| ): |
| dbl2 = {{closs_grad}}(y_true[i], raw_prediction[i]{{with_param}}) |
| loss_out[i] = sample_weight[i] * dbl2.val1 |
| gradient_out[i] = sample_weight[i] * dbl2.val2 |
| |
| {{endif}} |
| |
| def gradient( |
| self, |
| const floating_in[::1] y_true, # IN |
| const floating_in[::1] raw_prediction, # IN |
| const floating_in[::1] sample_weight, # IN |
| floating_out[::1] gradient_out, # OUT |
| int n_threads=1 |
| ): |
| cdef: |
| int i |
| int n_samples = y_true.shape[0] |
| |
| if sample_weight is None: |
| for i in prange( |
| n_samples, schedule='static', nogil=True, num_threads=n_threads |
| ): |
| gradient_out[i] = {{cgrad}}(y_true[i], raw_prediction[i]{{with_param}}) |
| else: |
| for i in prange( |
| n_samples, schedule='static', nogil=True, num_threads=n_threads |
| ): |
| gradient_out[i] = sample_weight[i] * {{cgrad}}(y_true[i], raw_prediction[i]{{with_param}}) |
| |
| def gradient_hessian( |
| self, |
| const floating_in[::1] y_true, # IN |
| const floating_in[::1] raw_prediction, # IN |
| const floating_in[::1] sample_weight, # IN |
| floating_out[::1] gradient_out, # OUT |
| floating_out[::1] hessian_out, # OUT |
| int n_threads=1 |
| ): |
| cdef: |
| int i |
| int n_samples = y_true.shape[0] |
| double_pair dbl2 |
| |
| if sample_weight is None: |
| for i in prange( |
| n_samples, schedule='static', nogil=True, num_threads=n_threads |
| ): |
| dbl2 = {{cgrad_hess}}(y_true[i], raw_prediction[i]{{with_param}}) |
| gradient_out[i] = dbl2.val1 |
| hessian_out[i] = dbl2.val2 |
| else: |
| for i in prange( |
| n_samples, schedule='static', nogil=True, num_threads=n_threads |
| ): |
| dbl2 = {{cgrad_hess}}(y_true[i], raw_prediction[i]{{with_param}}) |
| gradient_out[i] = sample_weight[i] * dbl2.val1 |
| hessian_out[i] = sample_weight[i] * dbl2.val2 |
| |
| {{endfor}} |
| |
| |
| # The multinomial deviance loss is also known as categorical cross-entropy or |
| # multinomial log-likelihood. |
| # Here, we do not inherit from CyLossFunction as its cy_gradient method deviates |
| # from the API. |
| cdef class CyHalfMultinomialLoss(): |
| """Half Multinomial deviance loss with multinomial logit link. |
|
| Domain: |
| y_true in {0, 1, 2, 3, .., n_classes - 1} |
| y_pred in (0, 1)**n_classes, i.e. interval with boundaries excluded |
|
| Link: |
| y_pred = softmax(raw_prediction) |
|
| Note: Label encoding is built-in, i.e. {0, 1, 2, 3, .., n_classes - 1} is |
| mapped to (y_true == k) for k = 0 .. n_classes - 1 which is either 0 or 1. |
| """ |
| |
| # Here we deviate from the CyLossFunction API. SAG/SAGA needs direct access to |
| # sample-wise gradients which we provide here. |
| cdef inline void cy_gradient( |
| self, |
| const floating_in y_true, |
| const floating_in[::1] raw_prediction, # IN |
| const floating_in sample_weight, |
| floating_out[::1] gradient_out, # OUT |
| ) noexcept nogil: |
| """Compute gradient of loss w.r.t. `raw_prediction` for a single sample. |
|
| The gradient of the multinomial logistic loss with respect to a class k, |
| and for one sample is: |
| grad_k = - sw * (p[k] - (y==k)) |
|
| where: |
| p[k] = proba[k] = exp(raw_prediction[k] - logsumexp(raw_prediction)) |
| sw = sample_weight |
|
| Parameters |
| ---------- |
| y_true : double |
| Observed, true target value. |
| raw_prediction : array of shape (n_classes,) |
| Raw prediction values (in link space). |
| sample_weight : double |
| Sample weight. |
| gradient_out : array of shape (n_classes,)
| A location into which the gradient is stored. |
| """ |
| cdef: |
| int k |
| int n_classes = raw_prediction.shape[0] |
| double_pair max_value_and_sum_exps |
| const floating_in[:, :] raw = raw_prediction[None, :] |
| |
| max_value_and_sum_exps = sum_exp_minus_max(0, raw, &gradient_out[0]) |
| for k in range(n_classes): |
| # gradient_out[k] = p_k = y_pred_k = prob of class k |
| gradient_out[k] /= max_value_and_sum_exps.val2 |
| # gradient_k = (p_k - (y_true == k)) * sw |
| gradient_out[k] = (gradient_out[k] - (y_true == k)) * sample_weight |
| |
| def _test_cy_gradient( |
| self, |
| const floating_in[::1] y_true, # IN |
| const floating_in[:, ::1] raw_prediction, # IN |
| const floating_in[::1] sample_weight, # IN |
| ): |
| """For testing only.""" |
| cdef: |
| int i, k |
| int n_samples = y_true.shape[0] |
| int n_classes = raw_prediction.shape[1] |
| floating_in [:, ::1] gradient_out |
| gradient = np.empty((n_samples, n_classes), dtype=np.float64) |
| gradient_out = gradient |
| |
| for i in range(n_samples): |
| self.cy_gradient( |
| y_true=y_true[i], |
| raw_prediction=raw_prediction[i, :], |
| sample_weight=1.0 if sample_weight is None else sample_weight[i], |
| gradient_out=gradient_out[i, :], |
| ) |
| return gradient |
| |
| # Note that we do not assume memory alignment/contiguity of 2d arrays. |
| # There seems to be little benefit in doing so. Benchmarks proving the
| # opposite are welcome.
| def loss( |
| self, |
| const floating_in[::1] y_true, # IN |
| const floating_in[:, :] raw_prediction, # IN |
| const floating_in[::1] sample_weight, # IN |
| floating_out[::1] loss_out, # OUT |
| int n_threads=1 |
| ): |
| cdef: |
| int i, k |
| int n_samples = y_true.shape[0] |
| int n_classes = raw_prediction.shape[1] |
| floating_in max_value, sum_exps |
| floating_in* p # temporary buffer |
| double_pair max_value_and_sum_exps |
| |
| # We assume n_samples > n_classes. In this case having the inner loop |
| # over n_classes is a good default. |
| # TODO: If every memoryview is contiguous and raw_prediction is |
| # f-contiguous, can we write a better algo (loops) to improve |
| # performance? |
| if sample_weight is None: |
| # inner loop over n_classes |
| with nogil, parallel(num_threads=n_threads): |
| # Define private buffer variables as each thread might use its |
| # own. |
| p = <floating_in *> malloc(sizeof(floating_in) * (n_classes)) |
| |
| for i in prange(n_samples, schedule='static'): |
| max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) |
| max_value = max_value_and_sum_exps.val1 |
| sum_exps = max_value_and_sum_exps.val2 |
| loss_out[i] = log(sum_exps) + max_value |
| |
| # label encoded y_true |
| k = int(y_true[i]) |
| loss_out[i] -= raw_prediction[i, k] |
| |
| free(p) |
| else: |
| with nogil, parallel(num_threads=n_threads): |
| p = <floating_in *> malloc(sizeof(floating_in) * (n_classes)) |
| |
| for i in prange(n_samples, schedule='static'): |
| max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) |
| max_value = max_value_and_sum_exps.val1 |
| sum_exps = max_value_and_sum_exps.val2 |
| loss_out[i] = log(sum_exps) + max_value |
| |
| # label encoded y_true |
| k = int(y_true[i]) |
| loss_out[i] -= raw_prediction[i, k] |
| |
| loss_out[i] *= sample_weight[i] |
| |
| free(p) |
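|
| # NumPy reference for the loop above (illustration only): per sample i, the
| # loss is the log-sum-exp of the raw predictions minus the raw prediction of
| # the true class, e.g.
| #
| #     from scipy.special import logsumexp
| #     k = y_true.astype(int)
| #     loss = logsumexp(raw_prediction, axis=1) - raw_prediction[np.arange(n_samples), k]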
| |
| def loss_gradient( |
| self, |
| const floating_in[::1] y_true, # IN |
| const floating_in[:, :] raw_prediction, # IN |
| const floating_in[::1] sample_weight, # IN |
| floating_out[::1] loss_out, # OUT |
| floating_out[:, :] gradient_out, # OUT |
| int n_threads=1 |
| ): |
| cdef: |
| int i, k |
| int n_samples = y_true.shape[0] |
| int n_classes = raw_prediction.shape[1] |
| floating_in max_value, sum_exps |
| floating_in* p # temporary buffer |
| double_pair max_value_and_sum_exps |
| |
| if sample_weight is None: |
| # inner loop over n_classes |
| with nogil, parallel(num_threads=n_threads): |
| # Define private buffer variables as each thread might use its |
| # own. |
| p = <floating_in *> malloc(sizeof(floating_in) * (n_classes)) |
| |
| for i in prange(n_samples, schedule='static'): |
| max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) |
| max_value = max_value_and_sum_exps.val1 |
| sum_exps = max_value_and_sum_exps.val2 |
| loss_out[i] = log(sum_exps) + max_value |
| |
| for k in range(n_classes): |
| # label decode y_true |
| if y_true[i] == k: |
| loss_out[i] -= raw_prediction[i, k] |
| p[k] /= sum_exps # p_k = y_pred_k = prob of class k |
| # gradient_k = p_k - (y_true == k) |
| gradient_out[i, k] = p[k] - (y_true[i] == k) |
| |
| free(p) |
| else: |
| with nogil, parallel(num_threads=n_threads): |
| p = <floating_in *> malloc(sizeof(floating_in) * (n_classes)) |
| |
| for i in prange(n_samples, schedule='static'): |
| max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) |
| max_value = max_value_and_sum_exps.val1 |
| sum_exps = max_value_and_sum_exps.val2 |
| loss_out[i] = log(sum_exps) + max_value |
| |
| for k in range(n_classes): |
| # label decode y_true |
| if y_true[i] == k: |
| loss_out[i] -= raw_prediction[i, k] |
| p[k] /= sum_exps # p_k = y_pred_k = prob of class k |
| # gradient_k = (p_k - (y_true == k)) * sw |
| gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] |
| |
| loss_out[i] *= sample_weight[i] |
| |
| free(p) |
| |
| def gradient( |
| self, |
| const floating_in[::1] y_true, # IN |
| const floating_in[:, :] raw_prediction, # IN |
| const floating_in[::1] sample_weight, # IN |
| floating_out[:, :] gradient_out, # OUT |
| int n_threads=1 |
| ): |
| cdef: |
| int i, k |
| int n_samples = y_true.shape[0] |
| int n_classes = raw_prediction.shape[1] |
| floating_in sum_exps |
| floating_in* p # temporary buffer |
| double_pair max_value_and_sum_exps |
| |
| if sample_weight is None: |
| # inner loop over n_classes |
| with nogil, parallel(num_threads=n_threads): |
| # Define private buffer variables as each thread might use its |
| # own. |
| p = <floating_in *> malloc(sizeof(floating_in) * (n_classes)) |
| |
| for i in prange(n_samples, schedule='static'): |
| max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) |
| sum_exps = max_value_and_sum_exps.val2 |
| |
| for k in range(n_classes): |
| p[k] /= sum_exps # p_k = y_pred_k = prob of class k |
| # gradient_k = y_pred_k - (y_true == k) |
| gradient_out[i, k] = p[k] - (y_true[i] == k) |
| |
| free(p) |
| else: |
| with nogil, parallel(num_threads=n_threads): |
| p = <floating_in *> malloc(sizeof(floating_in) * (n_classes)) |
| |
| for i in prange(n_samples, schedule='static'): |
| max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) |
| sum_exps = max_value_and_sum_exps.val2 |
| |
| for k in range(n_classes): |
| p[k] /= sum_exps # p_k = y_pred_k = prob of class k |
| # gradient_k = (p_k - (y_true == k)) * sw |
| gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] |
| |
| free(p) |
| |
| def gradient_hessian( |
| self, |
| const floating_in[::1] y_true, # IN |
| const floating_in[:, :] raw_prediction, # IN |
| const floating_in[::1] sample_weight, # IN |
| floating_out[:, :] gradient_out, # OUT |
| floating_out[:, :] hessian_out, # OUT |
| int n_threads=1 |
| ): |
| cdef: |
| int i, k |
| int n_samples = y_true.shape[0] |
| int n_classes = raw_prediction.shape[1] |
| floating_in sum_exps |
| floating_in* p # temporary buffer |
| double_pair max_value_and_sum_exps |
| |
| if sample_weight is None: |
| # inner loop over n_classes |
| with nogil, parallel(num_threads=n_threads): |
| # Define private buffer variables as each thread might use its |
| # own. |
| p = <floating_in *> malloc(sizeof(floating_in) * (n_classes)) |
| |
| for i in prange(n_samples, schedule='static'): |
| max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) |
| sum_exps = max_value_and_sum_exps.val2 |
| |
| for k in range(n_classes): |
| p[k] /= sum_exps # p_k = y_pred_k = prob of class k |
| # hessian_k = p_k * (1 - p_k) |
| # gradient_k = p_k - (y_true == k) |
| gradient_out[i, k] = p[k] - (y_true[i] == k) |
| hessian_out[i, k] = p[k] * (1. - p[k]) |
| |
| free(p) |
| else: |
| with nogil, parallel(num_threads=n_threads): |
| p = <floating_in *> malloc(sizeof(floating_in) * (n_classes)) |
| |
| for i in prange(n_samples, schedule='static'): |
| max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) |
| sum_exps = max_value_and_sum_exps.val2 |
| |
| for k in range(n_classes): |
| p[k] /= sum_exps # p_k = y_pred_k = prob of class k |
| # gradient_k = (p_k - (y_true == k)) * sw |
| # hessian_k = p_k * (1 - p_k) * sw |
| gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] |
| hessian_out[i, k] = (p[k] * (1. - p[k])) * sample_weight[i] |
| |
| free(p) |
| |
| # This method simplifies the implementation of hessp in linear models, |
| # i.e. the matrix-vector product of the full hessian, not only of the |
| # diagonal (in the classes) approximation as implemented above. |
| def gradient_proba( |
| self, |
| const floating_in[::1] y_true, # IN |
| const floating_in[:, :] raw_prediction, # IN |
| const floating_in[::1] sample_weight, # IN |
| floating_out[:, :] gradient_out, # OUT |
| floating_out[:, :] proba_out, # OUT |
| int n_threads=1 |
| ): |
| cdef: |
| int i, k |
| int n_samples = y_true.shape[0] |
| int n_classes = raw_prediction.shape[1] |
| floating_in sum_exps |
| floating_in* p # temporary buffer |
| double_pair max_value_and_sum_exps |
| |
| if sample_weight is None: |
| # inner loop over n_classes |
| with nogil, parallel(num_threads=n_threads): |
| # Define private buffer variables as each thread might use its |
| # own. |
| p = <floating_in *> malloc(sizeof(floating_in) * (n_classes)) |
| |
| for i in prange(n_samples, schedule='static'): |
| max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) |
| sum_exps = max_value_and_sum_exps.val2 |
| |
| for k in range(n_classes): |
| proba_out[i, k] = p[k] / sum_exps # y_pred_k = prob of class k |
| # gradient_k = y_pred_k - (y_true == k) |
| gradient_out[i, k] = proba_out[i, k] - (y_true[i] == k) |
| |
| free(p) |
| else: |
| with nogil, parallel(num_threads=n_threads): |
| p = <floating_in *> malloc(sizeof(floating_in) * (n_classes)) |
| |
| for i in prange(n_samples, schedule='static'): |
| max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) |
| sum_exps = max_value_and_sum_exps.val2 |
| |
| for k in range(n_classes): |
| proba_out[i, k] = p[k] / sum_exps # y_pred_k = prob of class k |
| # gradient_k = (p_k - (y_true == k)) * sw |
| gradient_out[i, k] = (proba_out[i, k] - (y_true[i] == k)) * sample_weight[i] |
| |
| free(p) |
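|
| # For the full (non-diagonal) multinomial hessian, each sample contributes
| # H_i = sample_weight_i * (diag(p_i) - p_i p_i^T), so a hessian-vector product
| # can be built from proba_out without materializing H_i (sketch, NumPy, with
| # v of shape (n_samples, n_classes) and sample_weight applied afterwards):
| #
| #     hessp = proba_out * v - proba_out * (proba_out * v).sum(axis=1, keepdims=True)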
| |