| """Solvers for Ridge and LogisticRegression using SAG algorithm""" |
|
|
| |
| |
|
|
| import warnings |
|
|
| import numpy as np |
|
|
| from ..exceptions import ConvergenceWarning |
| from ..utils import check_array |
| from ..utils.extmath import row_norms |
| from ..utils.validation import _check_sample_weight |
| from ._base import make_dataset |
| from ._sag_fast import sag32, sag64 |
|
|
|
|
def get_auto_step_size(
    max_squared_sum, alpha_scaled, loss, fit_intercept, n_samples=None, is_saga=False
):
| """Compute automatic step size for SAG solver. |
| |
| The step size is set to 1 / (alpha_scaled + L + fit_intercept) where L is |
| the max sum of squares for over all samples. |
| |
| Parameters |
| ---------- |
| max_squared_sum : float |
| Maximum squared sum of X over samples. |
| |
| alpha_scaled : float |
| Constant that multiplies the regularization term, scaled by |
| 1. / n_samples, the number of samples. |
| |
| loss : {'log', 'squared', 'multinomial'} |
| The loss function used in SAG solver. |
| |
| fit_intercept : bool |
| Specifies if a constant (a.k.a. bias or intercept) will be |
| added to the decision function. |
| |
| n_samples : int, default=None |
| Number of rows in X. Useful if is_saga=True. |
| |
| is_saga : bool, default=False |
| Whether to return step size for the SAGA algorithm or the SAG |
| algorithm. |
| |
| Returns |
| ------- |
| step_size : float |
| Step size used in SAG solver. |
| |
| References |
| ---------- |
| Schmidt, M., Roux, N. L., & Bach, F. (2013). |
| Minimizing finite sums with the stochastic average gradient |
| https://hal.inria.fr/hal-00860051/document |
| |
| :arxiv:`Defazio, A., Bach F. & Lacoste-Julien S. (2014). |
| "SAGA: A Fast Incremental Gradient Method With Support |
| for Non-Strongly Convex Composite Objectives" <1407.0202>` |
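
    Examples
    --------
    A small worked example for the squared loss with plain SAG (the values
    are only illustrative; note that this helper lives in the private module
    ``sklearn.linear_model._sag``):

    >>> from sklearn.linear_model._sag import get_auto_step_size
    >>> # L = max_squared_sum + int(fit_intercept) + alpha_scaled = 4 + 1 + 0.5
    >>> get_auto_step_size(4.0, 0.5, 'squared', True)
    0.18181818181818182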
| """ |
| if loss in ("log", "multinomial"): |
| L = 0.25 * (max_squared_sum + int(fit_intercept)) + alpha_scaled |
| elif loss == "squared": |
| |
| L = max_squared_sum + int(fit_intercept) + alpha_scaled |
| else: |
| raise ValueError( |
| "Unknown loss function for SAG solver, got %s instead of 'log' or 'squared'" |
| % loss |
| ) |
    if is_saga:
        # SAGA theoretical step size is 1/3L or 1 / (2 * (L + mu * n));
        # see Defazio et al. (2014), referenced above
        mun = min(2 * n_samples * alpha_scaled, L)
        step = 1.0 / (2 * L + mun)
    else:
        # SAG theory suggests a step size of 1 / 16L, but 1 / L is
        # recommended in practice; see Schmidt et al. (2013), referenced above
        step = 1.0 / L
    return step


def sag_solver(
    X,
    y,
    sample_weight=None,
    loss="log",
    alpha=1.0,
    beta=0.0,
    max_iter=1000,
    tol=0.001,
    verbose=0,
    random_state=None,
    check_input=True,
    max_squared_sum=None,
    warm_start_mem=None,
    is_saga=False,
):
| """SAG solver for Ridge and LogisticRegression. |
| |
| SAG stands for Stochastic Average Gradient: the gradient of the loss is |
| estimated each sample at a time and the model is updated along the way with |
| a constant learning rate. |
| |
| IMPORTANT NOTE: 'sag' solver converges faster on columns that are on the |
| same scale. You can normalize the data by using |
| sklearn.preprocessing.StandardScaler on your data before passing it to the |
| fit method. |
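
    For instance (a minimal sketch; ``X_train`` stands for your raw training
    data)::

        from sklearn.preprocessing import StandardScaler
        X_train = StandardScaler().fit_transform(X_train)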

    This implementation works with data represented as dense numpy arrays or
    sparse scipy arrays of floating point values for the features. It will
    fit the data according to squared loss or log loss.

    The regularizer is a penalty added to the loss function that shrinks model
    parameters towards the zero vector using the squared euclidean norm L2.

    .. versionadded:: 0.17

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training data.

    y : ndarray of shape (n_samples,)
        Target values. With loss='multinomial', y must be label encoded
        (see preprocessing.LabelEncoder). For loss='log' it must be in [0, 1].

    sample_weight : array-like of shape (n_samples,), default=None
        Weights applied to individual samples (1. for unweighted).

    loss : {'log', 'squared', 'multinomial'}, default='log'
        Loss function that will be optimized:

        - 'log' is the binary logistic loss, as used in LogisticRegression.
        - 'squared' is the squared loss, as used in Ridge.
        - 'multinomial' is the multinomial logistic loss, as used in
          LogisticRegression.

        .. versionadded:: 0.18
           *loss='multinomial'*

    alpha : float, default=1.
        L2 regularization term in the objective function
        ``(0.5 * alpha * || W ||_F^2)``.

    beta : float, default=0.
        L1 regularization term in the objective function
        ``(beta * || W ||_1)``. Only applied if ``is_saga`` is set to True.

    max_iter : int, default=1000
        The max number of passes over the training data if the stopping
        criterion is not reached.

    tol : float, default=0.001
        The stopping criterion for the weights. The iterations will stop when
        max(change in weights) / max(weights) < tol.

    verbose : int, default=0
        The verbosity level.

    random_state : int, RandomState instance or None, default=None
        Used when shuffling the data. Pass an int for reproducible output
        across multiple function calls.
        See :term:`Glossary <random_state>`.

    check_input : bool, default=True
        If False, the input arrays X and y will not be checked.

    max_squared_sum : float, default=None
        Maximum squared sum of X over samples. If None, it will be computed,
        going through all the samples. The value should be precomputed
        to speed up cross validation.

    warm_start_mem : dict, default=None
        The initialization parameters used for warm starting. Warm starting is
        currently used in LogisticRegression but not in Ridge.
        It contains:

        - 'coef': the weight vector, with the intercept in the last row
          if the intercept is fitted.
        - 'gradient_memory': the scalar gradient for all seen samples.
        - 'sum_gradient': the sum of gradients over all seen samples,
          for each feature.
        - 'intercept_sum_gradient': the sum of gradients over all seen
          samples, for the intercept.
        - 'seen': boolean array indicating which samples have been seen.
        - 'num_seen': the number of seen samples.
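
        For instance, a freshly initialised memory for a single-output problem
        would hold entries with the following shapes (an illustrative sketch,
        the dict does not need to be built by hand)::

            warm_start_mem = {
                'coef': np.zeros((n_features + 1, 1)),  # intercept in last row
                'sum_gradient': np.zeros((n_features, 1)),
                'intercept_sum_gradient': np.zeros(1),
                'gradient_memory': np.zeros((n_samples, 1)),
                'seen': np.zeros(n_samples, dtype=np.int32),
                'num_seen': 0,
            }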

    is_saga : bool, default=False
        Whether to use the SAGA algorithm or the SAG algorithm. SAGA behaves
        better in the first epochs, and allows for L1 regularisation.

    Returns
    -------
    coef_ : ndarray of shape (n_features,)
        Weight vector.

    n_iter_ : int
        The number of full passes over all samples.

    warm_start_mem : dict
        Contains a 'coef' key with the fitted result, and possibly the
        fitted intercept at the end of the array. Contains also other keys
        used for warm starting.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn import linear_model
    >>> n_samples, n_features = 10, 5
    >>> rng = np.random.RandomState(0)
    >>> X = rng.randn(n_samples, n_features)
    >>> y = rng.randn(n_samples)
    >>> clf = linear_model.Ridge(solver='sag')
    >>> clf.fit(X, y)
    Ridge(solver='sag')

    >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
    >>> y = np.array([1, 1, 2, 2])
    >>> clf = linear_model.LogisticRegression(solver='sag')
    >>> clf.fit(X, y)
    LogisticRegression(solver='sag')
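
    The solver can also be called directly; a minimal sketch reusing the
    arrays defined above (``sag_solver`` is a private helper, so its
    signature may change between versions):

    >>> from sklearn.linear_model._sag import sag_solver
    >>> coef, n_iter, warm_start_mem = sag_solver(X, y, loss='squared')
    >>> coef.shape
    (2,)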

    References
    ----------
    Schmidt, M., Roux, N. L., & Bach, F. (2013).
    Minimizing finite sums with the stochastic average gradient
    https://hal.inria.fr/hal-00860051/document

    :arxiv:`Defazio, A., Bach F. & Lacoste-Julien S. (2014).
    "SAGA: A Fast Incremental Gradient Method With Support
    for Non-Strongly Convex Composite Objectives" <1407.0202>`

    See Also
    --------
    Ridge, SGDRegressor, ElasticNet, Lasso, SVR,
    LogisticRegression, SGDClassifier, LinearSVC, Perceptron
    """
    if warm_start_mem is None:
        warm_start_mem = {}
    # Ridge default max_iter is None
    if max_iter is None:
        max_iter = 1000

    if check_input:
        _dtype = [np.float64, np.float32]
        X = check_array(X, dtype=_dtype, accept_sparse="csr", order="C")
        y = check_array(y, dtype=_dtype, ensure_2d=False, order="C")

    n_samples, n_features = X.shape[0], X.shape[1]

    # regularization strengths are rescaled per sample (see get_auto_step_size)
    alpha_scaled = float(alpha) / n_samples
    beta_scaled = float(beta) / n_samples

    # if loss == 'multinomial', y should be label encoded
    n_classes = int(y.max()) + 1 if loss == "multinomial" else 1

    # initialization
    sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

    if "coef" in warm_start_mem.keys():
        coef_init = warm_start_mem["coef"]
    else:
        # assume fit_intercept is False
        coef_init = np.zeros((n_features, n_classes), dtype=X.dtype, order="C")

    # coef_init may contain the intercept as its last row
    fit_intercept = coef_init.shape[0] == (n_features + 1)
    if fit_intercept:
        intercept_init = coef_init[-1, :]
        coef_init = coef_init[:-1, :]
    else:
        intercept_init = np.zeros(n_classes, dtype=X.dtype)

| if "intercept_sum_gradient" in warm_start_mem.keys(): |
| intercept_sum_gradient = warm_start_mem["intercept_sum_gradient"] |
| else: |
| intercept_sum_gradient = np.zeros(n_classes, dtype=X.dtype) |
|
|
| if "gradient_memory" in warm_start_mem.keys(): |
| gradient_memory_init = warm_start_mem["gradient_memory"] |
| else: |
| gradient_memory_init = np.zeros( |
| (n_samples, n_classes), dtype=X.dtype, order="C" |
| ) |
| if "sum_gradient" in warm_start_mem.keys(): |
| sum_gradient_init = warm_start_mem["sum_gradient"] |
| else: |
| sum_gradient_init = np.zeros((n_features, n_classes), dtype=X.dtype, order="C") |
|
|
| if "seen" in warm_start_mem.keys(): |
| seen_init = warm_start_mem["seen"] |
| else: |
| seen_init = np.zeros(n_samples, dtype=np.int32, order="C") |
|
|
| if "num_seen" in warm_start_mem.keys(): |
| num_seen_init = warm_start_mem["num_seen"] |
| else: |
| num_seen_init = 0 |
|
|
| dataset, intercept_decay = make_dataset(X, y, sample_weight, random_state) |
|
|
    if max_squared_sum is None:
        max_squared_sum = row_norms(X, squared=True).max()
    step_size = get_auto_step_size(
        max_squared_sum,
        alpha_scaled,
        loss,
        fit_intercept,
        n_samples=n_samples,
        is_saga=is_saga,
    )
    if step_size * alpha_scaled == 1:
        # the internal weight scale would shrink to exactly zero and later be
        # divided by, hence the explicit error
        raise ZeroDivisionError(
            "Current sag implementation does not handle "
            "the case step_size * alpha_scaled == 1"
        )

    # dispatch to the Cython routine matching the input dtype
    sag = sag64 if X.dtype == np.float64 else sag32
    num_seen, n_iter_ = sag(
        dataset,
        coef_init,
        intercept_init,
        n_samples,
        n_features,
        n_classes,
        tol,
        max_iter,
        loss,
        step_size,
        alpha_scaled,
        beta_scaled,
        sum_gradient_init,
        gradient_memory_init,
        seen_init,
        num_seen_init,
        fit_intercept,
        intercept_sum_gradient,
        intercept_decay,
        is_saga,
        verbose,
    )

    if n_iter_ == max_iter:
        warnings.warn(
            "The max_iter was reached which means the coef_ did not converge",
            ConvergenceWarning,
        )

    if fit_intercept:
        coef_init = np.vstack((coef_init, intercept_init))

    warm_start_mem = {
        "coef": coef_init,
        "sum_gradient": sum_gradient_init,
        "intercept_sum_gradient": intercept_sum_gradient,
        "gradient_memory": gradient_memory_init,
        "seen": seen_init,
        "num_seen": num_seen,
    }

    if loss == "multinomial":
        # return the coefficients as (n_classes, n_features[+ intercept])
        coef_ = coef_init.T
    else:
        coef_ = coef_init[:, 0]

    return coef_, n_iter_, warm_start_mem