| """ |
| Forest of trees-based ensemble methods. |
| |
| Those methods include random forests and extremely randomized trees. |
| |
| The module structure is the following: |
| |
| - The ``BaseForest`` base class implements a common ``fit`` method for all |
| the estimators in the module. The ``fit`` method of the ``BaseForest`` |
| class calls the ``fit`` method of each sub-estimator on random samples |
| (with replacement, a.k.a. bootstrap) of the training set. |
| |
| The init of the sub-estimator is further delegated to the |
| ``BaseEnsemble`` constructor. |
| |
| - The ``ForestClassifier`` and ``ForestRegressor`` base classes further |
| implement the prediction logic by computing an average of the predicted |
| outcomes of the sub-estimators. |
| |
| - The ``RandomForestClassifier`` and ``RandomForestRegressor`` derived |
| classes provide the user with concrete implementations of |
| the forest ensemble method using classical, deterministic |
| ``DecisionTreeClassifier`` and ``DecisionTreeRegressor`` as |
| sub-estimator implementations. |
| |
| - The ``ExtraTreesClassifier`` and ``ExtraTreesRegressor`` derived |
| classes provide the user with concrete implementations of the |
| forest ensemble method using the extremely randomized trees |
| ``ExtraTreeClassifier`` and ``ExtraTreeRegressor`` as |
| sub-estimator implementations. |
| |
| Single and multi-output problems are both handled. |
| """ |
|
|
| |
| |
|
|
|
|
| import threading |
| from abc import ABCMeta, abstractmethod |
| from numbers import Integral, Real |
| from warnings import catch_warnings, simplefilter, warn |
|
|
| import numpy as np |
| from scipy.sparse import hstack as sparse_hstack |
| from scipy.sparse import issparse |
|
|
| from ..base import ( |
| ClassifierMixin, |
| MultiOutputMixin, |
| RegressorMixin, |
| TransformerMixin, |
| _fit_context, |
| is_classifier, |
| ) |
| from ..exceptions import DataConversionWarning |
| from ..metrics import accuracy_score, r2_score |
| from ..preprocessing import OneHotEncoder |
| from ..tree import ( |
| BaseDecisionTree, |
| DecisionTreeClassifier, |
| DecisionTreeRegressor, |
| ExtraTreeClassifier, |
| ExtraTreeRegressor, |
| ) |
| from ..tree._tree import DOUBLE, DTYPE |
| from ..utils import check_random_state, compute_sample_weight |
| from ..utils._param_validation import Interval, RealNotInt, StrOptions |
| from ..utils._tags import get_tags |
| from ..utils.multiclass import check_classification_targets, type_of_target |
| from ..utils.parallel import Parallel, delayed |
| from ..utils.validation import ( |
| _check_feature_names_in, |
| _check_sample_weight, |
| _num_samples, |
| check_is_fitted, |
| validate_data, |
| ) |
| from ._base import BaseEnsemble, _partition_estimators |
|
|
| __all__ = [ |
| "RandomForestClassifier", |
| "RandomForestRegressor", |
| "ExtraTreesClassifier", |
| "ExtraTreesRegressor", |
| "RandomTreesEmbedding", |
| ] |
|
|
| MAX_INT = np.iinfo(np.int32).max |
|
|
|
|
| def _get_n_samples_bootstrap(n_samples, max_samples): |
| """ |
| Get the number of samples in a bootstrap sample. |
| |
| Parameters |
| ---------- |
| n_samples : int |
| Number of samples in the dataset. |
| max_samples : int or float |
| The maximum number of samples to draw from the total available: |
| - if float, this indicates a fraction of the total and should be |
| in the interval `(0.0, 1.0]`; |
| - if int, this indicates the exact number of samples; |
| - if None, this indicates the total number of samples. |
| |
| Returns |
| ------- |
| n_samples_bootstrap : int |
| The total number of samples to draw for the bootstrap sample. |
| """ |
| if max_samples is None: |
| return n_samples |
|
|
| if isinstance(max_samples, Integral): |
| if max_samples > n_samples: |
| msg = "`max_samples` must be <= n_samples={} but got value {}" |
| raise ValueError(msg.format(n_samples, max_samples)) |
| return max_samples |
|
|
| if isinstance(max_samples, Real): |
| return max(round(n_samples * max_samples), 1) |
|
|
|
|
| def _generate_sample_indices(random_state, n_samples, n_samples_bootstrap): |
| """ |
| Private function used by the _parallel_build_trees function.""" |
|
|
| random_instance = check_random_state(random_state) |
| sample_indices = random_instance.randint( |
| 0, n_samples, n_samples_bootstrap, dtype=np.int32 |
| ) |
|
|
| return sample_indices |
|
|
|
|
| def _generate_unsampled_indices(random_state, n_samples, n_samples_bootstrap): |
| """ |
| Private function used by the forest._set_oob_score function.""" |
| sample_indices = _generate_sample_indices( |
| random_state, n_samples, n_samples_bootstrap |
| ) |
| sample_counts = np.bincount(sample_indices, minlength=n_samples) |
| unsampled_mask = sample_counts == 0 |
| indices_range = np.arange(n_samples) |
| unsampled_indices = indices_range[unsampled_mask] |
|
|
| return unsampled_indices |
|
|
|
|
| def _parallel_build_trees( |
| tree, |
| bootstrap, |
| X, |
| y, |
| sample_weight, |
| tree_idx, |
| n_trees, |
| verbose=0, |
| class_weight=None, |
| n_samples_bootstrap=None, |
| missing_values_in_feature_mask=None, |
| ): |
| """ |
| Private function used to fit a single tree in parallel.""" |
| if verbose > 1: |
| print("building tree %d of %d" % (tree_idx + 1, n_trees)) |
|
|
| if bootstrap: |
| n_samples = X.shape[0] |
| if sample_weight is None: |
| curr_sample_weight = np.ones((n_samples,), dtype=np.float64) |
| else: |
| curr_sample_weight = sample_weight.copy() |
|
|
| indices = _generate_sample_indices( |
| tree.random_state, n_samples, n_samples_bootstrap |
| ) |
| sample_counts = np.bincount(indices, minlength=n_samples) |
| curr_sample_weight *= sample_counts |
|
|
| if class_weight == "subsample": |
| with catch_warnings(): |
| simplefilter("ignore", DeprecationWarning) |
| curr_sample_weight *= compute_sample_weight("auto", y, indices=indices) |
| elif class_weight == "balanced_subsample": |
| curr_sample_weight *= compute_sample_weight("balanced", y, indices=indices) |
|
|
| tree._fit( |
| X, |
| y, |
| sample_weight=curr_sample_weight, |
| check_input=False, |
| missing_values_in_feature_mask=missing_values_in_feature_mask, |
| ) |
| else: |
| tree._fit( |
| X, |
| y, |
| sample_weight=sample_weight, |
| check_input=False, |
| missing_values_in_feature_mask=missing_values_in_feature_mask, |
| ) |
|
|
| return tree |
|
|
|
|
| class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): |
| """ |
| Base class for forests of trees. |
| |
| Warning: This class should not be used directly. Use derived classes |
| instead. |
| """ |
|
|
| _parameter_constraints: dict = { |
| "n_estimators": [Interval(Integral, 1, None, closed="left")], |
| "bootstrap": ["boolean"], |
| "oob_score": ["boolean", callable], |
| "n_jobs": [Integral, None], |
| "random_state": ["random_state"], |
| "verbose": ["verbose"], |
| "warm_start": ["boolean"], |
| "max_samples": [ |
| None, |
| Interval(RealNotInt, 0.0, 1.0, closed="right"), |
| Interval(Integral, 1, None, closed="left"), |
| ], |
| } |
|
|
| @abstractmethod |
| def __init__( |
| self, |
| estimator, |
| n_estimators=100, |
| *, |
| estimator_params=tuple(), |
| bootstrap=False, |
| oob_score=False, |
| n_jobs=None, |
| random_state=None, |
| verbose=0, |
| warm_start=False, |
| class_weight=None, |
| max_samples=None, |
| ): |
| super().__init__( |
| estimator=estimator, |
| n_estimators=n_estimators, |
| estimator_params=estimator_params, |
| ) |
|
|
| self.bootstrap = bootstrap |
| self.oob_score = oob_score |
| self.n_jobs = n_jobs |
| self.random_state = random_state |
| self.verbose = verbose |
| self.warm_start = warm_start |
| self.class_weight = class_weight |
| self.max_samples = max_samples |
|
|
| def apply(self, X): |
| """ |
| Apply trees in the forest to X, return leaf indices. |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) |
| The input samples. Internally, its dtype will be converted to |
| ``dtype=np.float32``. If a sparse matrix is provided, it will be |
| converted into a sparse ``csr_matrix``. |
| |
| Returns |
| ------- |
| X_leaves : ndarray of shape (n_samples, n_estimators) |
| For each datapoint x in X and for each tree in the forest, |
| return the index of the leaf x ends up in. |
| """ |
| X = self._validate_X_predict(X) |
| results = Parallel( |
| n_jobs=self.n_jobs, |
| verbose=self.verbose, |
| prefer="threads", |
| )(delayed(tree.apply)(X, check_input=False) for tree in self.estimators_) |
|
|
| return np.array(results).T |
|
|
| def decision_path(self, X): |
| """ |
| Return the decision path in the forest. |
| |
| .. versionadded:: 0.18 |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) |
| The input samples. Internally, its dtype will be converted to |
| ``dtype=np.float32``. If a sparse matrix is provided, it will be |
| converted into a sparse ``csr_matrix``. |
| |
| Returns |
| ------- |
| indicator : sparse matrix of shape (n_samples, n_nodes) |
| Return a node indicator matrix where non-zero elements indicate |
| that the samples go through the nodes. The matrix is in CSR |
| format. |
| |
| n_nodes_ptr : ndarray of shape (n_estimators + 1,) |
| The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]] |
| give the indicator value for the i-th estimator. |
| """ |
| X = self._validate_X_predict(X) |
| indicators = Parallel( |
| n_jobs=self.n_jobs, |
| verbose=self.verbose, |
| prefer="threads", |
| )( |
| delayed(tree.decision_path)(X, check_input=False) |
| for tree in self.estimators_ |
| ) |
|
|
| n_nodes = [0] |
| n_nodes.extend([i.shape[1] for i in indicators]) |
| n_nodes_ptr = np.array(n_nodes).cumsum() |
|
|
| return sparse_hstack(indicators).tocsr(), n_nodes_ptr |
|
|
| @_fit_context(prefer_skip_nested_validation=True) |
| def fit(self, X, y, sample_weight=None): |
| """ |
| Build a forest of trees from the training set (X, y). |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) |
| The training input samples. Internally, its dtype will be converted |
| to ``dtype=np.float32``. If a sparse matrix is provided, it will be |
| converted into a sparse ``csc_matrix``. |
| |
| y : array-like of shape (n_samples,) or (n_samples, n_outputs) |
| The target values (class labels in classification, real numbers in |
| regression). |
| |
| sample_weight : array-like of shape (n_samples,), default=None |
| Sample weights. If None, then samples are equally weighted. Splits |
| that would create child nodes with net zero or negative weight are |
| ignored while searching for a split in each node. In the case of |
| classification, splits are also ignored if they would result in any |
| single class carrying a negative weight in either child node. |
| |
| Returns |
| ------- |
| self : object |
| Fitted estimator. |
| """ |
| |
| if issparse(y): |
| raise ValueError("sparse multilabel-indicator for y is not supported.") |
|
|
| X, y = validate_data( |
| self, |
| X, |
| y, |
| multi_output=True, |
| accept_sparse="csc", |
| dtype=DTYPE, |
| ensure_all_finite=False, |
| ) |
| |
| |
| |
| |
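| # Build a bare sub-estimator with only the criterion set; it is used to |
| # check whether the underlying tree supports missing values and, if so, |
| # to compute the per-feature missing-value mask passed to every tree. |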
| estimator = type(self.estimator)(criterion=self.criterion) |
| missing_values_in_feature_mask = ( |
| estimator._compute_missing_values_in_feature_mask( |
| X, estimator_name=self.__class__.__name__ |
| ) |
| ) |
|
|
| if sample_weight is not None: |
| sample_weight = _check_sample_weight(sample_weight, X) |
|
|
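| # Pre-sort the sparse column indices once here so that each individual |
| # tree does not have to sort them again during fitting. |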
| if issparse(X): |
| |
| |
| X.sort_indices() |
|
|
| y = np.atleast_1d(y) |
| if y.ndim == 2 and y.shape[1] == 1: |
| warn( |
| ( |
| "A column-vector y was passed when a 1d array was" |
| " expected. Please change the shape of y to " |
| "(n_samples,), for example using ravel()." |
| ), |
| DataConversionWarning, |
| stacklevel=2, |
| ) |
|
|
| if y.ndim == 1: |
| |
| |
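| # Reshape to (n_samples, 1) so that single-output targets go through |
| # the same code path as multi-output targets. |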
| y = np.reshape(y, (-1, 1)) |
|
|
| if self.criterion == "poisson": |
| if np.any(y < 0): |
| raise ValueError( |
| "Some value(s) of y are negative which is " |
| "not allowed for Poisson regression." |
| ) |
| if np.sum(y) <= 0: |
| raise ValueError( |
| "Sum of y is not strictly positive which " |
| "is necessary for Poisson regression." |
| ) |
|
|
| self._n_samples, self.n_outputs_ = y.shape |
|
|
| y, expanded_class_weight = self._validate_y_class_weight(y) |
|
|
| if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: |
| y = np.ascontiguousarray(y, dtype=DOUBLE) |
|
|
| if expanded_class_weight is not None: |
| if sample_weight is not None: |
| sample_weight = sample_weight * expanded_class_weight |
| else: |
| sample_weight = expanded_class_weight |
|
|
| if not self.bootstrap and self.max_samples is not None: |
| raise ValueError( |
| "`max_sample` cannot be set if `bootstrap=False`. " |
| "Either switch to `bootstrap=True` or set " |
| "`max_sample=None`." |
| ) |
| elif self.bootstrap: |
| n_samples_bootstrap = _get_n_samples_bootstrap( |
| n_samples=X.shape[0], max_samples=self.max_samples |
| ) |
| else: |
| n_samples_bootstrap = None |
|
|
| self._n_samples_bootstrap = n_samples_bootstrap |
|
|
| self._validate_estimator() |
|
|
| if not self.bootstrap and self.oob_score: |
| raise ValueError("Out of bag estimation only available if bootstrap=True") |
|
|
| random_state = check_random_state(self.random_state) |
|
|
| if not self.warm_start or not hasattr(self, "estimators_"): |
| |
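| # Not warm-starting (or first call to fit): start from an empty ensemble. |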
| self.estimators_ = [] |
|
|
| n_more_estimators = self.n_estimators - len(self.estimators_) |
|
|
| if n_more_estimators < 0: |
| raise ValueError( |
| "n_estimators=%d must be larger or equal to " |
| "len(estimators_)=%d when warm_start==True" |
| % (self.n_estimators, len(self.estimators_)) |
| ) |
|
|
| elif n_more_estimators == 0: |
| warn( |
| "Warm-start fitting without increasing n_estimators does not " |
| "fit new trees." |
| ) |
| else: |
| if self.warm_start and len(self.estimators_) > 0: |
| |
| |
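| # Advance the random state once per previously fitted tree so that the |
| # seeds drawn for the new trees match those of an equivalent single fit. |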
| random_state.randint(MAX_INT, size=len(self.estimators_)) |
|
|
| trees = [ |
| self._make_estimator(append=False, random_state=random_state) |
| for i in range(n_more_estimators) |
| ] |
|
|
| |
| |
| |
| |
| |
| |
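| # Fit the new trees in parallel; the threading backend is preferred |
| # because the Cython tree-building code releases the GIL. |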
| trees = Parallel( |
| n_jobs=self.n_jobs, |
| verbose=self.verbose, |
| prefer="threads", |
| )( |
| delayed(_parallel_build_trees)( |
| t, |
| self.bootstrap, |
| X, |
| y, |
| sample_weight, |
| i, |
| len(trees), |
| verbose=self.verbose, |
| class_weight=self.class_weight, |
| n_samples_bootstrap=n_samples_bootstrap, |
| missing_values_in_feature_mask=missing_values_in_feature_mask, |
| ) |
| for i, t in enumerate(trees) |
| ) |
|
|
| |
| self.estimators_.extend(trees) |
|
|
| if self.oob_score and ( |
| n_more_estimators > 0 or not hasattr(self, "oob_score_") |
| ): |
| y_type = type_of_target(y) |
| if y_type == "unknown" or ( |
| is_classifier(self) and y_type == "multiclass-multioutput" |
| ): |
| |
| |
| |
| |
| raise ValueError( |
| "The type of target cannot be used to compute OOB " |
| f"estimates. Got {y_type} while only the following are " |
| "supported: continuous, continuous-multioutput, binary, " |
| "multiclass, multilabel-indicator." |
| ) |
|
|
| if callable(self.oob_score): |
| self._set_oob_score_and_attributes( |
| X, y, scoring_function=self.oob_score |
| ) |
| else: |
| self._set_oob_score_and_attributes(X, y) |
|
|
| |
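| # For single-output classification, unwrap the per-output lists so that |
| # n_classes_ and classes_ are exposed directly rather than as lists. |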
| if hasattr(self, "classes_") and self.n_outputs_ == 1: |
| self.n_classes_ = self.n_classes_[0] |
| self.classes_ = self.classes_[0] |
|
|
| return self |
|
|
| @abstractmethod |
| def _set_oob_score_and_attributes(self, X, y, scoring_function=None): |
| """Compute and set the OOB score and attributes. |
| |
| Parameters |
| ---------- |
| X : array-like of shape (n_samples, n_features) |
| The data matrix. |
| y : ndarray of shape (n_samples, n_outputs) |
| The target matrix. |
| scoring_function : callable, default=None |
| Scoring function for OOB score. Default depends on whether |
| this is a regression (R2 score) or classification problem |
| (accuracy score). |
| """ |
|
|
| def _compute_oob_predictions(self, X, y): |
| """Compute and set the OOB score. |
| |
| Parameters |
| ---------- |
| X : array-like of shape (n_samples, n_features) |
| The data matrix. |
| y : ndarray of shape (n_samples, n_outputs) |
| The target matrix. |
| |
| Returns |
| ------- |
| oob_pred : ndarray of shape (n_samples, n_classes, n_outputs) or \ |
| (n_samples, 1, n_outputs) |
| The OOB predictions. |
| """ |
| |
| if issparse(X): |
| X = X.tocsr() |
|
|
| n_samples = y.shape[0] |
| n_outputs = self.n_outputs_ |
| if is_classifier(self) and hasattr(self, "n_classes_"): |
| |
| |
| |
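| # All outputs of a supported classification target share the same number |
| # of classes, so the first entry of n_classes_ fixes the prediction width. |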
| oob_pred_shape = (n_samples, self.n_classes_[0], n_outputs) |
| else: |
| |
| |
| |
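| # Regression has no classes; keep a singleton middle axis so the shape |
| # stays consistent with the classification case. |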
| oob_pred_shape = (n_samples, 1, n_outputs) |
|
|
| oob_pred = np.zeros(shape=oob_pred_shape, dtype=np.float64) |
| n_oob_pred = np.zeros((n_samples, n_outputs), dtype=np.int64) |
|
|
| n_samples_bootstrap = _get_n_samples_bootstrap( |
| n_samples, |
| self.max_samples, |
| ) |
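| # For every tree, accumulate its predictions on the samples it did not |
| # see during fitting (its out-of-bag samples) and count the contributions. |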
| for estimator in self.estimators_: |
| unsampled_indices = _generate_unsampled_indices( |
| estimator.random_state, |
| n_samples, |
| n_samples_bootstrap, |
| ) |
|
|
| y_pred = self._get_oob_predictions(estimator, X[unsampled_indices, :]) |
| oob_pred[unsampled_indices, ...] += y_pred |
| n_oob_pred[unsampled_indices, :] += 1 |
|
|
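| # Average the accumulated predictions; samples that were in-bag for every |
| # tree have no OOB estimate, hence the warning and the divide-by-one guard. |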
| for k in range(n_outputs): |
| if (n_oob_pred == 0).any(): |
| warn( |
| ( |
| "Some inputs do not have OOB scores. This probably means " |
| "too few trees were used to compute any reliable OOB " |
| "estimates." |
| ), |
| UserWarning, |
| ) |
| n_oob_pred[n_oob_pred == 0] = 1 |
| oob_pred[..., k] /= n_oob_pred[..., [k]] |
|
|
| return oob_pred |
|
|
| def _validate_y_class_weight(self, y): |
| |
| return y, None |
|
|
| def _validate_X_predict(self, X): |
| """ |
| Validate X whenever one tries to predict, apply, or call predict_proba.""" |
| check_is_fitted(self) |
| if self.estimators_[0]._support_missing_values(X): |
| ensure_all_finite = "allow-nan" |
| else: |
| ensure_all_finite = True |
|
|
| X = validate_data( |
| self, |
| X, |
| dtype=DTYPE, |
| accept_sparse="csr", |
| reset=False, |
| ensure_all_finite=ensure_all_finite, |
| ) |
| if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc): |
| raise ValueError("No support for np.int64 index based sparse matrices") |
| return X |
|
|
| @property |
| def feature_importances_(self): |
| """ |
| The impurity-based feature importances. |
| |
| The higher, the more important the feature. |
| The importance of a feature is computed as the (normalized) |
| total reduction of the criterion brought by that feature. It is also |
| known as the Gini importance. |
| |
| Warning: impurity-based feature importances can be misleading for |
| high cardinality features (many unique values). See |
| :func:`sklearn.inspection.permutation_importance` as an alternative. |
| |
| Returns |
| ------- |
| feature_importances_ : ndarray of shape (n_features,) |
| The values of this array sum to 1, unless all trees are single node |
| trees consisting of only the root node, in which case it will be an |
| array of zeros. |
| """ |
| check_is_fitted(self) |
|
|
| all_importances = Parallel(n_jobs=self.n_jobs, prefer="threads")( |
| delayed(getattr)(tree, "feature_importances_") |
| for tree in self.estimators_ |
| if tree.tree_.node_count > 1 |
| ) |
|
|
| if not all_importances: |
| return np.zeros(self.n_features_in_, dtype=np.float64) |
|
|
| all_importances = np.mean(all_importances, axis=0, dtype=np.float64) |
| return all_importances / np.sum(all_importances) |
|
|
| def _get_estimators_indices(self): |
| |
| for tree in self.estimators_: |
| if not self.bootstrap: |
| yield np.arange(self._n_samples, dtype=np.int32) |
| else: |
| |
| |
| |
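| # Re-draw the bootstrap indices exactly as _parallel_build_trees() did, |
| # using the per-tree seed, instead of storing them during fit. |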
| seed = tree.random_state |
| |
| |
| yield _generate_sample_indices( |
| seed, self._n_samples, self._n_samples_bootstrap |
| ) |
|
|
| @property |
| def estimators_samples_(self): |
| """The subset of drawn samples for each base estimator. |
| |
| Returns a dynamically generated list of indices identifying |
| the samples used for fitting each member of the ensemble, i.e., |
| the in-bag samples. |
| |
| Note: the list is re-created at each call to the property in order |
| to reduce the object memory footprint by not storing the sampling |
| data. Thus fetching the property may be slower than expected. |
| """ |
| return [sample_indices for sample_indices in self._get_estimators_indices()] |
|
|
| def __sklearn_tags__(self): |
| tags = super().__sklearn_tags__() |
| |
| |
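| # Only the criterion is needed to determine whether the underlying tree |
| # supports missing values (NaN) in the input. |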
| estimator = type(self.estimator)(criterion=self.criterion) |
| tags.input_tags.allow_nan = get_tags(estimator).input_tags.allow_nan |
| return tags |
|
|
|
|
| def _accumulate_prediction(predict, X, out, lock): |
| """ |
| This is a utility function for joblib's Parallel. |
| |
| It can't go locally in ForestClassifier or ForestRegressor, because joblib |
| complains that it cannot pickle it when placed there. |
| """ |
| prediction = predict(X, check_input=False) |
| with lock: |
| if len(out) == 1: |
| out[0] += prediction |
| else: |
| for i in range(len(out)): |
| out[i] += prediction[i] |
|
|
|
|
| class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): |
| """ |
| Base class for forest of trees-based classifiers. |
| |
| Warning: This class should not be used directly. Use derived classes |
| instead. |
| """ |
|
|
| @abstractmethod |
| def __init__( |
| self, |
| estimator, |
| n_estimators=100, |
| *, |
| estimator_params=tuple(), |
| bootstrap=False, |
| oob_score=False, |
| n_jobs=None, |
| random_state=None, |
| verbose=0, |
| warm_start=False, |
| class_weight=None, |
| max_samples=None, |
| ): |
| super().__init__( |
| estimator=estimator, |
| n_estimators=n_estimators, |
| estimator_params=estimator_params, |
| bootstrap=bootstrap, |
| oob_score=oob_score, |
| n_jobs=n_jobs, |
| random_state=random_state, |
| verbose=verbose, |
| warm_start=warm_start, |
| class_weight=class_weight, |
| max_samples=max_samples, |
| ) |
|
|
| @staticmethod |
| def _get_oob_predictions(tree, X): |
| """Compute the OOB predictions for an individual tree. |
| |
| Parameters |
| ---------- |
| tree : DecisionTreeClassifier object |
| A single decision tree classifier. |
| X : ndarray of shape (n_samples, n_features) |
| The OOB samples. |
| |
| Returns |
| ------- |
| y_pred : ndarray of shape (n_samples, n_classes, n_outputs) |
| The OOB associated predictions. |
| """ |
| y_pred = tree.predict_proba(X, check_input=False) |
| y_pred = np.asarray(y_pred) |
| if y_pred.ndim == 2: |
| |
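| # Single output: add a trailing n_outputs axis of size one. |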
| y_pred = y_pred[..., np.newaxis] |
| else: |
| |
| |
| |
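| # Multi-output: move the leading n_outputs axis to the end, going from |
| # (n_outputs, n_samples, n_classes) to (n_samples, n_classes, n_outputs). |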
| y_pred = np.rollaxis(y_pred, axis=0, start=3) |
| return y_pred |
|
|
| def _set_oob_score_and_attributes(self, X, y, scoring_function=None): |
| """Compute and set the OOB score and attributes. |
| |
| Parameters |
| ---------- |
| X : array-like of shape (n_samples, n_features) |
| The data matrix. |
| y : ndarray of shape (n_samples, n_outputs) |
| The target matrix. |
| scoring_function : callable, default=None |
| Scoring function for OOB score. Defaults to `accuracy_score`. |
| """ |
| self.oob_decision_function_ = super()._compute_oob_predictions(X, y) |
| if self.oob_decision_function_.shape[-1] == 1: |
| |
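| # Drop the trailing n_outputs axis when there is a single output. |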
| self.oob_decision_function_ = self.oob_decision_function_.squeeze(axis=-1) |
|
|
| if scoring_function is None: |
| scoring_function = accuracy_score |
|
|
| self.oob_score_ = scoring_function( |
| y, np.argmax(self.oob_decision_function_, axis=1) |
| ) |
|
|
| def _validate_y_class_weight(self, y): |
| check_classification_targets(y) |
|
|
| y = np.copy(y) |
| expanded_class_weight = None |
|
|
| if self.class_weight is not None: |
| y_original = np.copy(y) |
|
|
| self.classes_ = [] |
| self.n_classes_ = [] |
|
|
| y_store_unique_indices = np.zeros(y.shape, dtype=int) |
| for k in range(self.n_outputs_): |
| classes_k, y_store_unique_indices[:, k] = np.unique( |
| y[:, k], return_inverse=True |
| ) |
| self.classes_.append(classes_k) |
| self.n_classes_.append(classes_k.shape[0]) |
| y = y_store_unique_indices |
|
|
| if self.class_weight is not None: |
| valid_presets = ("balanced", "balanced_subsample") |
| if isinstance(self.class_weight, str): |
| if self.class_weight not in valid_presets: |
| raise ValueError( |
| "Valid presets for class_weight include " |
| '"balanced" and "balanced_subsample".' |
| 'Given "%s".' % self.class_weight |
| ) |
| if self.warm_start: |
| warn( |
| 'class_weight presets "balanced" or ' |
| '"balanced_subsample" are ' |
| "not recommended for warm_start if the fitted data " |
| "differs from the full dataset. In order to use " |
| '"balanced" weights, use compute_class_weight ' |
| '("balanced", classes, y). In place of y you can use ' |
| "a large enough sample of the full training set " |
| "target to properly estimate the class frequency " |
| "distributions. Pass the resulting weights as the " |
| "class_weight parameter." |
| ) |
|
|
| if self.class_weight != "balanced_subsample" or not self.bootstrap: |
| if self.class_weight == "balanced_subsample": |
| class_weight = "balanced" |
| else: |
| class_weight = self.class_weight |
| expanded_class_weight = compute_sample_weight(class_weight, y_original) |
|
|
| return y, expanded_class_weight |
|
|
| def predict(self, X): |
| """ |
| Predict class for X. |
| |
| The predicted class of an input sample is a vote by the trees in |
| the forest, weighted by their probability estimates. That is, |
| the predicted class is the one with highest mean probability |
| estimate across the trees. |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) |
| The input samples. Internally, its dtype will be converted to |
| ``dtype=np.float32``. If a sparse matrix is provided, it will be |
| converted into a sparse ``csr_matrix``. |
| |
| Returns |
| ------- |
| y : ndarray of shape (n_samples,) or (n_samples, n_outputs) |
| The predicted classes. |
| """ |
| proba = self.predict_proba(X) |
|
|
| if self.n_outputs_ == 1: |
| return self.classes_.take(np.argmax(proba, axis=1), axis=0) |
|
|
| else: |
| n_samples = proba[0].shape[0] |
| |
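| # All outputs are assumed to share the same class dtype, so take the first. |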
| class_type = self.classes_[0].dtype |
| predictions = np.empty((n_samples, self.n_outputs_), dtype=class_type) |
|
|
| for k in range(self.n_outputs_): |
| predictions[:, k] = self.classes_[k].take( |
| np.argmax(proba[k], axis=1), axis=0 |
| ) |
|
|
| return predictions |
|
|
| def predict_proba(self, X): |
| """ |
| Predict class probabilities for X. |
| |
| The predicted class probabilities of an input sample are computed as |
| the mean predicted class probabilities of the trees in the forest. |
| The class probability of a single tree is the fraction of samples of |
| the same class in a leaf. |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) |
| The input samples. Internally, its dtype will be converted to |
| ``dtype=np.float32``. If a sparse matrix is provided, it will be |
| converted into a sparse ``csr_matrix``. |
| |
| Returns |
| ------- |
| p : ndarray of shape (n_samples, n_classes), or a list of such arrays |
| The class probabilities of the input samples. The order of the |
| classes corresponds to that in the attribute :term:`classes_`. |
| """ |
| check_is_fitted(self) |
| |
| X = self._validate_X_predict(X) |
|
|
| |
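| # Determine the effective number of parallel jobs for this many trees. |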
| n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) |
|
|
| |
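| # Pre-allocate one shared probability accumulator per output; each tree |
| # adds its contribution under a lock instead of storing every prediction. |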
| all_proba = [ |
| np.zeros((X.shape[0], j), dtype=np.float64) |
| for j in np.atleast_1d(self.n_classes_) |
| ] |
| lock = threading.Lock() |
| Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem")( |
| delayed(_accumulate_prediction)(e.predict_proba, X, all_proba, lock) |
| for e in self.estimators_ |
| ) |
|
|
| for proba in all_proba: |
| proba /= len(self.estimators_) |
|
|
| if len(all_proba) == 1: |
| return all_proba[0] |
| else: |
| return all_proba |
|
|
| def predict_log_proba(self, X): |
| """ |
| Predict class log-probabilities for X. |
| |
| The predicted class log-probabilities of an input sample are computed as |
| the log of the mean predicted class probabilities of the trees in the |
| forest. |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) |
| The input samples. Internally, its dtype will be converted to |
| ``dtype=np.float32``. If a sparse matrix is provided, it will be |
| converted into a sparse ``csr_matrix``. |
| |
| Returns |
| ------- |
| p : ndarray of shape (n_samples, n_classes), or a list of such arrays |
| The class log-probabilities of the input samples. The order of the |
| classes corresponds to that in the attribute :term:`classes_`. |
| """ |
| proba = self.predict_proba(X) |
|
|
| if self.n_outputs_ == 1: |
| return np.log(proba) |
|
|
| else: |
| for k in range(self.n_outputs_): |
| proba[k] = np.log(proba[k]) |
|
|
| return proba |
|
|
| def __sklearn_tags__(self): |
| tags = super().__sklearn_tags__() |
| tags.classifier_tags.multi_label = True |
| tags.input_tags.sparse = True |
| return tags |
|
|
|
|
| class ForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta): |
| """ |
| Base class for forest of trees-based regressors. |
| |
| Warning: This class should not be used directly. Use derived classes |
| instead. |
| """ |
|
|
| @abstractmethod |
| def __init__( |
| self, |
| estimator, |
| n_estimators=100, |
| *, |
| estimator_params=tuple(), |
| bootstrap=False, |
| oob_score=False, |
| n_jobs=None, |
| random_state=None, |
| verbose=0, |
| warm_start=False, |
| max_samples=None, |
| ): |
| super().__init__( |
| estimator, |
| n_estimators=n_estimators, |
| estimator_params=estimator_params, |
| bootstrap=bootstrap, |
| oob_score=oob_score, |
| n_jobs=n_jobs, |
| random_state=random_state, |
| verbose=verbose, |
| warm_start=warm_start, |
| max_samples=max_samples, |
| ) |
|
|
| def predict(self, X): |
| """ |
| Predict regression target for X. |
| |
| The predicted regression target of an input sample is computed as the |
| mean predicted regression targets of the trees in the forest. |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) |
| The input samples. Internally, its dtype will be converted to |
| ``dtype=np.float32``. If a sparse matrix is provided, it will be |
| converted into a sparse ``csr_matrix``. |
| |
| Returns |
| ------- |
| y : ndarray of shape (n_samples,) or (n_samples, n_outputs) |
| The predicted values. |
| """ |
| check_is_fitted(self) |
| |
| X = self._validate_X_predict(X) |
|
|
| |
| n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) |
|
|
| |
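| # Pre-allocate a single accumulator for the summed predictions; each tree |
| # adds its output under a lock instead of storing every prediction. |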
| if self.n_outputs_ > 1: |
| y_hat = np.zeros((X.shape[0], self.n_outputs_), dtype=np.float64) |
| else: |
| y_hat = np.zeros((X.shape[0]), dtype=np.float64) |
|
|
| |
| lock = threading.Lock() |
| Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem")( |
| delayed(_accumulate_prediction)(e.predict, X, [y_hat], lock) |
| for e in self.estimators_ |
| ) |
|
|
| y_hat /= len(self.estimators_) |
|
|
| return y_hat |
|
|
| @staticmethod |
| def _get_oob_predictions(tree, X): |
| """Compute the OOB predictions for an individual tree. |
| |
| Parameters |
| ---------- |
| tree : DecisionTreeRegressor object |
| A single decision tree regressor. |
| X : ndarray of shape (n_samples, n_features) |
| The OOB samples. |
| |
| Returns |
| ------- |
| y_pred : ndarray of shape (n_samples, 1, n_outputs) |
| The OOB associated predictions. |
| """ |
| y_pred = tree.predict(X, check_input=False) |
| if y_pred.ndim == 1: |
| |
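| # Single output: expand to (n_samples, 1, 1). |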
| y_pred = y_pred[:, np.newaxis, np.newaxis] |
| else: |
| |
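| # Multiple outputs: expand to (n_samples, 1, n_outputs). |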
| y_pred = y_pred[:, np.newaxis, :] |
| return y_pred |
|
|
| def _set_oob_score_and_attributes(self, X, y, scoring_function=None): |
| """Compute and set the OOB score and attributes. |
| |
| Parameters |
| ---------- |
| X : array-like of shape (n_samples, n_features) |
| The data matrix. |
| y : ndarray of shape (n_samples, n_outputs) |
| The target matrix. |
| scoring_function : callable, default=None |
| Scoring function for OOB score. Defaults to `r2_score`. |
| """ |
| self.oob_prediction_ = super()._compute_oob_predictions(X, y).squeeze(axis=1) |
| if self.oob_prediction_.shape[-1] == 1: |
| |
| self.oob_prediction_ = self.oob_prediction_.squeeze(axis=-1) |
|
|
| if scoring_function is None: |
| scoring_function = r2_score |
|
|
| self.oob_score_ = scoring_function(y, self.oob_prediction_) |
|
|
| def _compute_partial_dependence_recursion(self, grid, target_features): |
| """Fast partial dependence computation. |
| |
| Parameters |
| ---------- |
| grid : ndarray of shape (n_samples, n_target_features), dtype=DTYPE |
| The grid points on which the partial dependence should be |
| evaluated. |
| target_features : ndarray of shape (n_target_features), dtype=np.intp |
| The set of target features for which the partial dependence |
| should be evaluated. |
| |
| Returns |
| ------- |
| averaged_predictions : ndarray of shape (n_samples,) |
| The value of the partial dependence function on each grid point. |
| """ |
| grid = np.asarray(grid, dtype=DTYPE, order="C") |
| target_features = np.asarray(target_features, dtype=np.intp, order="C") |
| averaged_predictions = np.zeros( |
| shape=grid.shape[0], dtype=np.float64, order="C" |
| ) |
|
|
| for tree in self.estimators_: |
| |
| |
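| # Each tree adds its partial dependence values into the shared buffer; |
| # the accumulated sum is averaged over the forest below. |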
| tree.tree_.compute_partial_dependence( |
| grid, target_features, averaged_predictions |
| ) |
| |
| averaged_predictions /= len(self.estimators_) |
|
|
| return averaged_predictions |
|
|
| def __sklearn_tags__(self): |
| tags = super().__sklearn_tags__() |
| tags.input_tags.sparse = True |
| return tags |
|
|
|
|
| class RandomForestClassifier(ForestClassifier): |
| """ |
| A random forest classifier. |
| |
| A random forest is a meta estimator that fits a number of decision tree |
| classifiers on various sub-samples of the dataset and uses averaging to |
| improve the predictive accuracy and control over-fitting. |
| Trees in the forest use the best split strategy, i.e. equivalent to passing |
| `splitter="best"` to the underlying :class:`~sklearn.tree.DecisionTreeClassifier`. |
| The sub-sample size is controlled with the `max_samples` parameter if |
| `bootstrap=True` (default), otherwise the whole dataset is used to build |
| each tree. |
| |
| For a comparison between tree-based ensemble models see the example |
| :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`. |
| |
| Read more in the :ref:`User Guide <forest>`. |
| |
| Parameters |
| ---------- |
| n_estimators : int, default=100 |
| The number of trees in the forest. |
| |
| .. versionchanged:: 0.22 |
| The default value of ``n_estimators`` changed from 10 to 100 |
| in 0.22. |
| |
| criterion : {"gini", "entropy", "log_loss"}, default="gini" |
| The function to measure the quality of a split. Supported criteria are |
| "gini" for the Gini impurity and "log_loss" and "entropy" both for the |
| Shannon information gain, see :ref:`tree_mathematical_formulation`. |
| Note: This parameter is tree-specific. |
| |
| max_depth : int, default=None |
| The maximum depth of the tree. If None, then nodes are expanded until |
| all leaves are pure or until all leaves contain less than |
| min_samples_split samples. |
| |
| min_samples_split : int or float, default=2 |
| The minimum number of samples required to split an internal node: |
| |
| - If int, then consider `min_samples_split` as the minimum number. |
| - If float, then `min_samples_split` is a fraction and |
| `ceil(min_samples_split * n_samples)` are the minimum |
| number of samples for each split. |
| |
| .. versionchanged:: 0.18 |
| Added float values for fractions. |
| |
| min_samples_leaf : int or float, default=1 |
| The minimum number of samples required to be at a leaf node. |
| A split point at any depth will only be considered if it leaves at |
| least ``min_samples_leaf`` training samples in each of the left and |
| right branches. This may have the effect of smoothing the model, |
| especially in regression. |
| |
| - If int, then consider `min_samples_leaf` as the minimum number. |
| - If float, then `min_samples_leaf` is a fraction and |
| `ceil(min_samples_leaf * n_samples)` are the minimum |
| number of samples for each node. |
| |
| .. versionchanged:: 0.18 |
| Added float values for fractions. |
| |
| min_weight_fraction_leaf : float, default=0.0 |
| The minimum weighted fraction of the sum total of weights (of all |
| the input samples) required to be at a leaf node. Samples have |
| equal weight when sample_weight is not provided. |
| |
| max_features : {"sqrt", "log2", None}, int or float, default="sqrt" |
| The number of features to consider when looking for the best split: |
| |
| - If int, then consider `max_features` features at each split. |
| - If float, then `max_features` is a fraction and |
| `max(1, int(max_features * n_features_in_))` features are considered at each |
| split. |
| - If "sqrt", then `max_features=sqrt(n_features)`. |
| - If "log2", then `max_features=log2(n_features)`. |
| - If None, then `max_features=n_features`. |
| |
| .. versionchanged:: 1.1 |
| The default of `max_features` changed from `"auto"` to `"sqrt"`. |
| |
| Note: the search for a split does not stop until at least one |
| valid partition of the node samples is found, even if it requires |
| effectively inspecting more than ``max_features`` features. |
| |
| max_leaf_nodes : int, default=None |
| Grow trees with ``max_leaf_nodes`` in best-first fashion. |
| Best nodes are defined by their relative reduction in impurity. |
| If None, the number of leaf nodes is unlimited. |
| |
| min_impurity_decrease : float, default=0.0 |
| A node will be split if this split induces a decrease of the impurity |
| greater than or equal to this value. |
| |
| The weighted impurity decrease equation is the following:: |
| |
| N_t / N * (impurity - N_t_R / N_t * right_impurity |
| - N_t_L / N_t * left_impurity) |
| |
| where ``N`` is the total number of samples, ``N_t`` is the number of |
| samples at the current node, ``N_t_L`` is the number of samples in the |
| left child, and ``N_t_R`` is the number of samples in the right child. |
| |
| ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, |
| if ``sample_weight`` is passed. |
| |
| .. versionadded:: 0.19 |
| |
| bootstrap : bool, default=True |
| Whether bootstrap samples are used when building trees. If False, the |
| whole dataset is used to build each tree. |
| |
| oob_score : bool or callable, default=False |
| Whether to use out-of-bag samples to estimate the generalization score. |
| By default, :func:`~sklearn.metrics.accuracy_score` is used. |
| Provide a callable with signature `metric(y_true, y_pred)` to use a |
| custom metric. Only available if `bootstrap=True`. |
| |
| n_jobs : int, default=None |
| The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`, |
| :meth:`decision_path` and :meth:`apply` are all parallelized over the |
| trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` |
| context. ``-1`` means using all processors. See :term:`Glossary |
| <n_jobs>` for more details. |
| |
| random_state : int, RandomState instance or None, default=None |
| Controls both the randomness of the bootstrapping of the samples used |
| when building trees (if ``bootstrap=True``) and the sampling of the |
| features to consider when looking for the best split at each node |
| (if ``max_features < n_features``). |
| See :term:`Glossary <random_state>` for details. |
| |
| verbose : int, default=0 |
| Controls the verbosity when fitting and predicting. |
| |
| warm_start : bool, default=False |
| When set to ``True``, reuse the solution of the previous call to fit |
| and add more estimators to the ensemble, otherwise, just fit a whole |
| new forest. See :term:`Glossary <warm_start>` and |
| :ref:`tree_ensemble_warm_start` for details. |
| |
| class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, \ |
| default=None |
| Weights associated with classes in the form ``{class_label: weight}``. |
| If not given, all classes are supposed to have weight one. For |
| multi-output problems, a list of dicts can be provided in the same |
| order as the columns of y. |
| |
| Note that for multioutput (including multilabel) weights should be |
| defined for each class of every column in its own dict. For example, |
| for four-class multilabel classification weights should be |
| [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of |
| [{1:1}, {2:5}, {3:1}, {4:1}]. |
| |
| The "balanced" mode uses the values of y to automatically adjust |
| weights inversely proportional to class frequencies in the input data |
| as ``n_samples / (n_classes * np.bincount(y))`` |
| |
| The "balanced_subsample" mode is the same as "balanced" except that |
| weights are computed based on the bootstrap sample for every tree |
| grown. |
| |
| For multi-output, the weights of each column of y will be multiplied. |
| |
| Note that these weights will be multiplied with sample_weight (passed |
| through the fit method) if sample_weight is specified. |
| |
| ccp_alpha : non-negative float, default=0.0 |
| Complexity parameter used for Minimal Cost-Complexity Pruning. The |
| subtree with the largest cost complexity that is smaller than |
| ``ccp_alpha`` will be chosen. By default, no pruning is performed. See |
| :ref:`minimal_cost_complexity_pruning` for details. See |
| :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` |
| for an example of such pruning. |
| |
| .. versionadded:: 0.22 |
| |
| max_samples : int or float, default=None |
| If bootstrap is True, the number of samples to draw from X |
| to train each base estimator. |
| |
| - If None (default), then draw `X.shape[0]` samples. |
| - If int, then draw `max_samples` samples. |
| - If float, then draw `max(round(n_samples * max_samples), 1)` samples. Thus, |
| `max_samples` should be in the interval `(0.0, 1.0]`. |
| |
| .. versionadded:: 0.22 |
| |
| monotonic_cst : array-like of int of shape (n_features), default=None |
| Indicates the monotonicity constraint to enforce on each feature. |
| - 1: monotonic increase |
| - 0: no constraint |
| - -1: monotonic decrease |
| |
| If monotonic_cst is None, no constraints are applied. |
| |
| Monotonicity constraints are not supported for: |
| - multiclass classifications (i.e. when `n_classes > 2`), |
| - multioutput classifications (i.e. when `n_outputs_ > 1`), |
| - classifications trained on data with missing values. |
| |
| The constraints hold over the probability of the positive class. |
| |
| Read more in the :ref:`User Guide <monotonic_cst_gbdt>`. |
| |
| .. versionadded:: 1.4 |
| |
| Attributes |
| ---------- |
| estimator_ : :class:`~sklearn.tree.DecisionTreeClassifier` |
| The child estimator template used to create the collection of fitted |
| sub-estimators. |
| |
| .. versionadded:: 1.2 |
| `base_estimator_` was renamed to `estimator_`. |
| |
| estimators_ : list of DecisionTreeClassifier |
| The collection of fitted sub-estimators. |
| |
| classes_ : ndarray of shape (n_classes,) or a list of such arrays |
| The class labels (single output problem), or a list of arrays of |
| class labels (multi-output problem). |
| |
| n_classes_ : int or list |
| The number of classes (single output problem), or a list containing the |
| number of classes for each output (multi-output problem). |
| |
| n_features_in_ : int |
| Number of features seen during :term:`fit`. |
| |
| .. versionadded:: 0.24 |
| |
| feature_names_in_ : ndarray of shape (`n_features_in_`,) |
| Names of features seen during :term:`fit`. Defined only when `X` |
| has feature names that are all strings. |
| |
| .. versionadded:: 1.0 |
| |
| n_outputs_ : int |
| The number of outputs when ``fit`` is performed. |
| |
| feature_importances_ : ndarray of shape (n_features,) |
| The impurity-based feature importances. |
| The higher, the more important the feature. |
| The importance of a feature is computed as the (normalized) |
| total reduction of the criterion brought by that feature. It is also |
| known as the Gini importance. |
| |
| Warning: impurity-based feature importances can be misleading for |
| high cardinality features (many unique values). See |
| :func:`sklearn.inspection.permutation_importance` as an alternative. |
| |
| oob_score_ : float |
| Score of the training dataset obtained using an out-of-bag estimate. |
| This attribute exists only when ``oob_score`` is True. |
| |
| oob_decision_function_ : ndarray of shape (n_samples, n_classes) or \ |
| (n_samples, n_classes, n_outputs) |
| Decision function computed with out-of-bag estimate on the training |
| set. If n_estimators is small it might be possible that a data point |
| was never left out during the bootstrap. In this case, |
| `oob_decision_function_` might contain NaN. This attribute exists |
| only when ``oob_score`` is True. |
| |
| estimators_samples_ : list of arrays |
| The subset of drawn samples (i.e., the in-bag samples) for each base |
| estimator. Each subset is defined by an array of the indices selected. |
| |
| .. versionadded:: 1.4 |
| |
| See Also |
| -------- |
| sklearn.tree.DecisionTreeClassifier : A decision tree classifier. |
| sklearn.ensemble.ExtraTreesClassifier : Ensemble of extremely randomized |
| tree classifiers. |
| sklearn.ensemble.HistGradientBoostingClassifier : A Histogram-based Gradient |
| Boosting Classification Tree, very fast for big datasets (n_samples >= |
| 10_000). |
| |
| Notes |
| ----- |
| The default values for the parameters controlling the size of the trees |
| (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and |
| unpruned trees which can potentially be very large on some data sets. To |
| reduce memory consumption, the complexity and size of the trees should be |
| controlled by setting those parameter values. |
| |
| The features are always randomly permuted at each split. Therefore, |
| the best found split may vary, even with the same training data, |
| ``max_features=n_features`` and ``bootstrap=False``, if the improvement |
| of the criterion is identical for several splits enumerated during the |
| search of the best split. To obtain a deterministic behaviour during |
| fitting, ``random_state`` has to be fixed. |
| |
| References |
| ---------- |
| .. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001. |
| |
| Examples |
| -------- |
| >>> from sklearn.ensemble import RandomForestClassifier |
| >>> from sklearn.datasets import make_classification |
| >>> X, y = make_classification(n_samples=1000, n_features=4, |
| ... n_informative=2, n_redundant=0, |
| ... random_state=0, shuffle=False) |
| >>> clf = RandomForestClassifier(max_depth=2, random_state=0) |
| >>> clf.fit(X, y) |
| RandomForestClassifier(...) |
| >>> print(clf.predict([[0, 0, 0, 0]])) |
| [1] |
| """ |
|
|
| _parameter_constraints: dict = { |
| **ForestClassifier._parameter_constraints, |
| **DecisionTreeClassifier._parameter_constraints, |
| "class_weight": [ |
| StrOptions({"balanced_subsample", "balanced"}), |
| dict, |
| list, |
| None, |
| ], |
| } |
| _parameter_constraints.pop("splitter") |
|
|
| def __init__( |
| self, |
| n_estimators=100, |
| *, |
| criterion="gini", |
| max_depth=None, |
| min_samples_split=2, |
| min_samples_leaf=1, |
| min_weight_fraction_leaf=0.0, |
| max_features="sqrt", |
| max_leaf_nodes=None, |
| min_impurity_decrease=0.0, |
| bootstrap=True, |
| oob_score=False, |
| n_jobs=None, |
| random_state=None, |
| verbose=0, |
| warm_start=False, |
| class_weight=None, |
| ccp_alpha=0.0, |
| max_samples=None, |
| monotonic_cst=None, |
| ): |
| super().__init__( |
| estimator=DecisionTreeClassifier(), |
| n_estimators=n_estimators, |
| estimator_params=( |
| "criterion", |
| "max_depth", |
| "min_samples_split", |
| "min_samples_leaf", |
| "min_weight_fraction_leaf", |
| "max_features", |
| "max_leaf_nodes", |
| "min_impurity_decrease", |
| "random_state", |
| "ccp_alpha", |
| "monotonic_cst", |
| ), |
| bootstrap=bootstrap, |
| oob_score=oob_score, |
| n_jobs=n_jobs, |
| random_state=random_state, |
| verbose=verbose, |
| warm_start=warm_start, |
| class_weight=class_weight, |
| max_samples=max_samples, |
| ) |
|
|
| self.criterion = criterion |
| self.max_depth = max_depth |
| self.min_samples_split = min_samples_split |
| self.min_samples_leaf = min_samples_leaf |
| self.min_weight_fraction_leaf = min_weight_fraction_leaf |
| self.max_features = max_features |
| self.max_leaf_nodes = max_leaf_nodes |
| self.min_impurity_decrease = min_impurity_decrease |
| self.monotonic_cst = monotonic_cst |
| self.ccp_alpha = ccp_alpha |
|
|
|
|
| class RandomForestRegressor(ForestRegressor): |
| """ |
| A random forest regressor. |
| |
| A random forest is a meta estimator that fits a number of decision tree |
| regressors on various sub-samples of the dataset and uses averaging to |
| improve the predictive accuracy and control over-fitting. |
| Trees in the forest use the best split strategy, i.e. equivalent to passing |
| `splitter="best"` to the underlying :class:`~sklearn.tree.DecisionTreeRegressor`. |
| The sub-sample size is controlled with the `max_samples` parameter if |
| `bootstrap=True` (default), otherwise the whole dataset is used to build |
| each tree. |
| |
| For a comparison between tree-based ensemble models see the example |
| :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`. |
| |
| Read more in the :ref:`User Guide <forest>`. |
| |
| Parameters |
| ---------- |
| n_estimators : int, default=100 |
| The number of trees in the forest. |
| |
| .. versionchanged:: 0.22 |
| The default value of ``n_estimators`` changed from 10 to 100 |
| in 0.22. |
| |
| criterion : {"squared_error", "absolute_error", "friedman_mse", "poisson"}, \ |
| default="squared_error" |
| The function to measure the quality of a split. Supported criteria |
| are "squared_error" for the mean squared error, which is equal to |
| variance reduction as feature selection criterion and minimizes the L2 |
| loss using the mean of each terminal node, "friedman_mse", which uses |
| mean squared error with Friedman's improvement score for potential |
| splits, "absolute_error" for the mean absolute error, which minimizes |
| the L1 loss using the median of each terminal node, and "poisson" which |
| uses reduction in Poisson deviance to find splits. |
| Training using "absolute_error" is significantly slower |
| than when using "squared_error". |
| |
| .. versionadded:: 0.18 |
| Mean Absolute Error (MAE) criterion. |
| |
| .. versionadded:: 1.0 |
| Poisson criterion. |
| |
| max_depth : int, default=None |
| The maximum depth of the tree. If None, then nodes are expanded until |
| all leaves are pure or until all leaves contain less than |
| min_samples_split samples. |
| |
| min_samples_split : int or float, default=2 |
| The minimum number of samples required to split an internal node: |
| |
| - If int, then consider `min_samples_split` as the minimum number. |
| - If float, then `min_samples_split` is a fraction and |
| `ceil(min_samples_split * n_samples)` are the minimum |
| number of samples for each split. |
| |
| .. versionchanged:: 0.18 |
| Added float values for fractions. |
| |
| min_samples_leaf : int or float, default=1 |
| The minimum number of samples required to be at a leaf node. |
| A split point at any depth will only be considered if it leaves at |
| least ``min_samples_leaf`` training samples in each of the left and |
| right branches. This may have the effect of smoothing the model, |
| especially in regression. |
| |
| - If int, then consider `min_samples_leaf` as the minimum number. |
| - If float, then `min_samples_leaf` is a fraction and |
| `ceil(min_samples_leaf * n_samples)` are the minimum |
| number of samples for each node. |
| |
| .. versionchanged:: 0.18 |
| Added float values for fractions. |
| |
| min_weight_fraction_leaf : float, default=0.0 |
| The minimum weighted fraction of the sum total of weights (of all |
| the input samples) required to be at a leaf node. Samples have |
| equal weight when sample_weight is not provided. |
| |
| max_features : {"sqrt", "log2", None}, int or float, default=1.0 |
| The number of features to consider when looking for the best split: |
| |
| - If int, then consider `max_features` features at each split. |
| - If float, then `max_features` is a fraction and |
| `max(1, int(max_features * n_features_in_))` features are considered at each |
| split. |
| - If "sqrt", then `max_features=sqrt(n_features)`. |
| - If "log2", then `max_features=log2(n_features)`. |
| - If None or 1.0, then `max_features=n_features`. |
| |
| .. note:: |
| The default of 1.0 is equivalent to bagged trees and more |
| randomness can be achieved by setting smaller values, e.g. 0.3. |
| |
| .. versionchanged:: 1.1 |
| The default of `max_features` changed from `"auto"` to 1.0. |
| |
| Note: the search for a split does not stop until at least one |
| valid partition of the node samples is found, even if it requires |
| effectively inspecting more than ``max_features`` features. |
| |
| max_leaf_nodes : int, default=None |
| Grow trees with ``max_leaf_nodes`` in best-first fashion. |
| Best nodes are defined by their relative reduction in impurity. |
| If None, the number of leaf nodes is unlimited. |
| |
| min_impurity_decrease : float, default=0.0 |
| A node will be split if this split induces a decrease of the impurity |
| greater than or equal to this value. |
| |
| The weighted impurity decrease equation is the following:: |
| |
| N_t / N * (impurity - N_t_R / N_t * right_impurity |
| - N_t_L / N_t * left_impurity) |
| |
| where ``N`` is the total number of samples, ``N_t`` is the number of |
| samples at the current node, ``N_t_L`` is the number of samples in the |
| left child, and ``N_t_R`` is the number of samples in the right child. |
| |
| ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, |
| if ``sample_weight`` is passed. |
| |
| .. versionadded:: 0.19 |
| |
| bootstrap : bool, default=True |
| Whether bootstrap samples are used when building trees. If False, the |
| whole dataset is used to build each tree. |
| |
| oob_score : bool or callable, default=False |
| Whether to use out-of-bag samples to estimate the generalization score. |
| By default, :func:`~sklearn.metrics.r2_score` is used. |
| Provide a callable with signature `metric(y_true, y_pred)` to use a |
| custom metric. Only available if `bootstrap=True`. |
| |
| n_jobs : int, default=None |
| The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`, |
| :meth:`decision_path` and :meth:`apply` are all parallelized over the |
| trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` |
| context. ``-1`` means using all processors. See :term:`Glossary |
| <n_jobs>` for more details. |
| |
| random_state : int, RandomState instance or None, default=None |
| Controls both the randomness of the bootstrapping of the samples used |
| when building trees (if ``bootstrap=True``) and the sampling of the |
| features to consider when looking for the best split at each node |
| (if ``max_features < n_features``). |
| See :term:`Glossary <random_state>` for details. |
| |
| verbose : int, default=0 |
| Controls the verbosity when fitting and predicting. |
| |
| warm_start : bool, default=False |
| When set to ``True``, reuse the solution of the previous call to fit |
| and add more estimators to the ensemble, otherwise, just fit a whole |
| new forest. See :term:`Glossary <warm_start>` and |
| :ref:`tree_ensemble_warm_start` for details. |
| |
| ccp_alpha : non-negative float, default=0.0 |
| Complexity parameter used for Minimal Cost-Complexity Pruning. The |
| subtree with the largest cost complexity that is smaller than |
| ``ccp_alpha`` will be chosen. By default, no pruning is performed. See |
| :ref:`minimal_cost_complexity_pruning` for details. See |
| :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` |
| for an example of such pruning. |
| |
| .. versionadded:: 0.22 |
| |
| max_samples : int or float, default=None |
| If bootstrap is True, the number of samples to draw from X |
| to train each base estimator. |
| |
| - If None (default), then draw `X.shape[0]` samples. |
| - If int, then draw `max_samples` samples. |
| - If float, then draw `max(round(n_samples * max_samples), 1)` samples. Thus, |
| `max_samples` should be in the interval `(0.0, 1.0]`. |
| |
| .. versionadded:: 0.22 |
| |
| monotonic_cst : array-like of int of shape (n_features), default=None |
| Indicates the monotonicity constraint to enforce on each feature. |
| - 1: monotonically increasing |
| - 0: no constraint |
| - -1: monotonically decreasing |
| |
| If monotonic_cst is None, no constraints are applied. |
| |
| Monotonicity constraints are not supported for: |
| - multioutput regressions (i.e. when `n_outputs_ > 1`), |
| - regressions trained on data with missing values. |
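
For example, a hedged sketch constraining a hypothetical three-feature
regressor so that predictions increase with the first feature and decrease
with the third::

reg = RandomForestRegressor(monotonic_cst=[1, 0, -1])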
| |
| Read more in the :ref:`User Guide <monotonic_cst_gbdt>`. |
| |
| .. versionadded:: 1.4 |
| |
| Attributes |
| ---------- |
| estimator_ : :class:`~sklearn.tree.DecisionTreeRegressor` |
| The child estimator template used to create the collection of fitted |
| sub-estimators. |
| |
| .. versionadded:: 1.2 |
| `base_estimator_` was renamed to `estimator_`. |
| |
| estimators_ : list of DecisionTreeRegressor |
| The collection of fitted sub-estimators. |
| |
| feature_importances_ : ndarray of shape (n_features,) |
| The impurity-based feature importances. |
| The higher, the more important the feature. |
| The importance of a feature is computed as the (normalized) |
| total reduction of the criterion brought by that feature. It is also |
| known as the Gini importance. |
| |
| Warning: impurity-based feature importances can be misleading for |
| high cardinality features (many unique values). See |
| :func:`sklearn.inspection.permutation_importance` as an alternative. |
| |
| n_features_in_ : int |
| Number of features seen during :term:`fit`. |
| |
| .. versionadded:: 0.24 |
| |
| feature_names_in_ : ndarray of shape (`n_features_in_`,) |
| Names of features seen during :term:`fit`. Defined only when `X` |
| has feature names that are all strings. |
| |
| .. versionadded:: 1.0 |
| |
| n_outputs_ : int |
| The number of outputs when ``fit`` is performed. |
| |
| oob_score_ : float |
| Score of the training dataset obtained using an out-of-bag estimate. |
| This attribute exists only when ``oob_score`` is True. |
| |
| oob_prediction_ : ndarray of shape (n_samples,) or (n_samples, n_outputs) |
| Prediction computed with out-of-bag estimate on the training set. |
| This attribute exists only when ``oob_score`` is True. |
| |
| estimators_samples_ : list of arrays |
| The subset of drawn samples (i.e., the in-bag samples) for each base |
| estimator. Each subset is defined by an array of the indices selected. |
| |
| .. versionadded:: 1.4 |
| |
| See Also |
| -------- |
| sklearn.tree.DecisionTreeRegressor : A decision tree regressor. |
| sklearn.ensemble.ExtraTreesRegressor : Ensemble of extremely randomized |
| tree regressors. |
| sklearn.ensemble.HistGradientBoostingRegressor : A Histogram-based Gradient |
| Boosting Regression Tree, very fast for big datasets (n_samples >= |
| 10_000). |
| |
| Notes |
| ----- |
| The default values for the parameters controlling the size of the trees |
| (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and |
| unpruned trees which can potentially be very large on some data sets. To |
| reduce memory consumption, the complexity and size of the trees should be |
| controlled by setting those parameter values. |
| |
| The features are always randomly permuted at each split. Therefore, |
| the best found split may vary, even with the same training data, |
| ``max_features=n_features`` and ``bootstrap=False``, if the improvement |
| of the criterion is identical for several splits enumerated during the |
| search of the best split. To obtain a deterministic behaviour during |
| fitting, ``random_state`` has to be fixed. |
| |
| The default value ``max_features=1.0`` uses ``n_features`` |
| rather than ``n_features / 3``. The latter was originally suggested in |
| [1], whereas the former was more recently justified empirically in [2]. |
| |
| References |
| ---------- |
| .. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001. |
| |
.. [2] P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized
| trees", Machine Learning, 63(1), 3-42, 2006. |
| |
| Examples |
| -------- |
| >>> from sklearn.ensemble import RandomForestRegressor |
| >>> from sklearn.datasets import make_regression |
| >>> X, y = make_regression(n_features=4, n_informative=2, |
| ... random_state=0, shuffle=False) |
| >>> regr = RandomForestRegressor(max_depth=2, random_state=0) |
| >>> regr.fit(X, y) |
| RandomForestRegressor(...) |
| >>> print(regr.predict([[0, 0, 0, 0]])) |
| [-8.32987858] |
| """ |
|
|
| _parameter_constraints: dict = { |
| **ForestRegressor._parameter_constraints, |
| **DecisionTreeRegressor._parameter_constraints, |
| } |
| _parameter_constraints.pop("splitter") |
|
|
| def __init__( |
| self, |
| n_estimators=100, |
| *, |
| criterion="squared_error", |
| max_depth=None, |
| min_samples_split=2, |
| min_samples_leaf=1, |
| min_weight_fraction_leaf=0.0, |
| max_features=1.0, |
| max_leaf_nodes=None, |
| min_impurity_decrease=0.0, |
| bootstrap=True, |
| oob_score=False, |
| n_jobs=None, |
| random_state=None, |
| verbose=0, |
| warm_start=False, |
| ccp_alpha=0.0, |
| max_samples=None, |
| monotonic_cst=None, |
| ): |
| super().__init__( |
| estimator=DecisionTreeRegressor(), |
| n_estimators=n_estimators, |
| estimator_params=( |
| "criterion", |
| "max_depth", |
| "min_samples_split", |
| "min_samples_leaf", |
| "min_weight_fraction_leaf", |
| "max_features", |
| "max_leaf_nodes", |
| "min_impurity_decrease", |
| "random_state", |
| "ccp_alpha", |
| "monotonic_cst", |
| ), |
| bootstrap=bootstrap, |
| oob_score=oob_score, |
| n_jobs=n_jobs, |
| random_state=random_state, |
| verbose=verbose, |
| warm_start=warm_start, |
| max_samples=max_samples, |
| ) |
|
|
| self.criterion = criterion |
| self.max_depth = max_depth |
| self.min_samples_split = min_samples_split |
| self.min_samples_leaf = min_samples_leaf |
| self.min_weight_fraction_leaf = min_weight_fraction_leaf |
| self.max_features = max_features |
| self.max_leaf_nodes = max_leaf_nodes |
| self.min_impurity_decrease = min_impurity_decrease |
| self.ccp_alpha = ccp_alpha |
| self.monotonic_cst = monotonic_cst |
|
|
|
|
| class ExtraTreesClassifier(ForestClassifier): |
| """ |
| An extra-trees classifier. |
| |
| This class implements a meta estimator that fits a number of |
| randomized decision trees (a.k.a. extra-trees) on various sub-samples |
| of the dataset and uses averaging to improve the predictive accuracy |
| and control over-fitting. |
| |
| Read more in the :ref:`User Guide <forest>`. |
| |
| Parameters |
| ---------- |
| n_estimators : int, default=100 |
| The number of trees in the forest. |
| |
| .. versionchanged:: 0.22 |
| The default value of ``n_estimators`` changed from 10 to 100 |
| in 0.22. |
| |
| criterion : {"gini", "entropy", "log_loss"}, default="gini" |
| The function to measure the quality of a split. Supported criteria are |
| "gini" for the Gini impurity and "log_loss" and "entropy" both for the |
| Shannon information gain, see :ref:`tree_mathematical_formulation`. |
| Note: This parameter is tree-specific. |
| |
| max_depth : int, default=None |
| The maximum depth of the tree. If None, then nodes are expanded until |
| all leaves are pure or until all leaves contain less than |
| min_samples_split samples. |
| |
| min_samples_split : int or float, default=2 |
| The minimum number of samples required to split an internal node: |
| |
| - If int, then consider `min_samples_split` as the minimum number. |
| - If float, then `min_samples_split` is a fraction and |
| `ceil(min_samples_split * n_samples)` are the minimum |
| number of samples for each split. |
| |
| .. versionchanged:: 0.18 |
| Added float values for fractions. |
| |
| min_samples_leaf : int or float, default=1 |
| The minimum number of samples required to be at a leaf node. |
| A split point at any depth will only be considered if it leaves at |
| least ``min_samples_leaf`` training samples in each of the left and |
| right branches. This may have the effect of smoothing the model, |
| especially in regression. |
| |
| - If int, then consider `min_samples_leaf` as the minimum number. |
| - If float, then `min_samples_leaf` is a fraction and |
| `ceil(min_samples_leaf * n_samples)` are the minimum |
| number of samples for each node. |
| |
| .. versionchanged:: 0.18 |
| Added float values for fractions. |
| |
| min_weight_fraction_leaf : float, default=0.0 |
| The minimum weighted fraction of the sum total of weights (of all |
| the input samples) required to be at a leaf node. Samples have |
| equal weight when sample_weight is not provided. |
| |
| max_features : {"sqrt", "log2", None}, int or float, default="sqrt" |
| The number of features to consider when looking for the best split: |
| |
| - If int, then consider `max_features` features at each split. |
| - If float, then `max_features` is a fraction and |
| `max(1, int(max_features * n_features_in_))` features are considered at each |
| split. |
| - If "sqrt", then `max_features=sqrt(n_features)`. |
| - If "log2", then `max_features=log2(n_features)`. |
| - If None, then `max_features=n_features`. |
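
For instance, with 10 input features (hypothetical), ``max_features=0.3``
considers ``max(1, int(0.3 * 10)) = 3`` features at each split.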
| |
| .. versionchanged:: 1.1 |
| The default of `max_features` changed from `"auto"` to `"sqrt"`. |
| |
| Note: the search for a split does not stop until at least one |
valid partition of the node samples is found, even if it effectively
requires inspecting more than ``max_features`` features.
| |
| max_leaf_nodes : int, default=None |
| Grow trees with ``max_leaf_nodes`` in best-first fashion. |
| Best nodes are defined as relative reduction in impurity. |
| If None then unlimited number of leaf nodes. |
| |
| min_impurity_decrease : float, default=0.0 |
| A node will be split if this split induces a decrease of the impurity |
| greater than or equal to this value. |
| |
| The weighted impurity decrease equation is the following:: |
| |
| N_t / N * (impurity - N_t_R / N_t * right_impurity |
| - N_t_L / N_t * left_impurity) |
| |
| where ``N`` is the total number of samples, ``N_t`` is the number of |
| samples at the current node, ``N_t_L`` is the number of samples in the |
| left child, and ``N_t_R`` is the number of samples in the right child. |
| |
| ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, |
| if ``sample_weight`` is passed. |
| |
| .. versionadded:: 0.19 |
| |
| bootstrap : bool, default=False |
| Whether bootstrap samples are used when building trees. If False, the |
| whole dataset is used to build each tree. |
| |
| oob_score : bool or callable, default=False |
| Whether to use out-of-bag samples to estimate the generalization score. |
| By default, :func:`~sklearn.metrics.accuracy_score` is used. |
| Provide a callable with signature `metric(y_true, y_pred)` to use a |
| custom metric. Only available if `bootstrap=True`. |
| |
| n_jobs : int, default=None |
| The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`, |
| :meth:`decision_path` and :meth:`apply` are all parallelized over the |
| trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` |
| context. ``-1`` means using all processors. See :term:`Glossary |
| <n_jobs>` for more details. |
| |
| random_state : int, RandomState instance or None, default=None |
| Controls 3 sources of randomness: |
| |
| - the bootstrapping of the samples used when building trees |
| (if ``bootstrap=True``) |
| - the sampling of the features to consider when looking for the best |
| split at each node (if ``max_features < n_features``) |
- the draw of the splits for each of the `max_features` sampled features
| |
| See :term:`Glossary <random_state>` for details. |
| |
| verbose : int, default=0 |
| Controls the verbosity when fitting and predicting. |
| |
| warm_start : bool, default=False |
| When set to ``True``, reuse the solution of the previous call to fit |
| and add more estimators to the ensemble, otherwise, just fit a whole |
| new forest. See :term:`Glossary <warm_start>` and |
| :ref:`tree_ensemble_warm_start` for details. |
| |
| class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, \ |
| default=None |
| Weights associated with classes in the form ``{class_label: weight}``. |
| If not given, all classes are supposed to have weight one. For |
| multi-output problems, a list of dicts can be provided in the same |
| order as the columns of y. |
| |
| Note that for multioutput (including multilabel) weights should be |
| defined for each class of every column in its own dict. For example, |
| for four-class multilabel classification weights should be |
| [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of |
| [{1:1}, {2:5}, {3:1}, {4:1}]. |
| |
| The "balanced" mode uses the values of y to automatically adjust |
| weights inversely proportional to class frequencies in the input data |
| as ``n_samples / (n_classes * np.bincount(y))`` |
| |
| The "balanced_subsample" mode is the same as "balanced" except that |
| weights are computed based on the bootstrap sample for every tree |
| grown. |
| |
| For multi-output, the weights of each column of y will be multiplied. |
| |
| Note that these weights will be multiplied with sample_weight (passed |
| through the fit method) if sample_weight is specified. |
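
For illustration, a hedged sketch of the weights the "balanced" mode
computes for a hypothetical binary problem with an 80/20 class split::

import numpy as np
y = np.array([0] * 80 + [1] * 20)
weights = len(y) / (2 * np.bincount(y))  # -> array([0.625, 2.5])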
| |
| ccp_alpha : non-negative float, default=0.0 |
| Complexity parameter used for Minimal Cost-Complexity Pruning. The |
| subtree with the largest cost complexity that is smaller than |
| ``ccp_alpha`` will be chosen. By default, no pruning is performed. See |
| :ref:`minimal_cost_complexity_pruning` for details. See |
| :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` |
| for an example of such pruning. |
| |
| .. versionadded:: 0.22 |
| |
| max_samples : int or float, default=None |
| If bootstrap is True, the number of samples to draw from X |
| to train each base estimator. |
| |
| - If None (default), then draw `X.shape[0]` samples. |
| - If int, then draw `max_samples` samples. |
- If float, then draw `max(round(n_samples * max_samples), 1)` samples. Thus,
| `max_samples` should be in the interval `(0.0, 1.0]`. |
| |
| .. versionadded:: 0.22 |
| |
| monotonic_cst : array-like of int of shape (n_features), default=None |
| Indicates the monotonicity constraint to enforce on each feature. |
| - 1: monotonically increasing |
| - 0: no constraint |
| - -1: monotonically decreasing |
| |
| If monotonic_cst is None, no constraints are applied. |
| |
| Monotonicity constraints are not supported for: |
| - multiclass classifications (i.e. when `n_classes > 2`), |
| - multioutput classifications (i.e. when `n_outputs_ > 1`), |
| - classifications trained on data with missing values. |
| |
| The constraints hold over the probability of the positive class. |
| |
| Read more in the :ref:`User Guide <monotonic_cst_gbdt>`. |
| |
| .. versionadded:: 1.4 |
| |
| Attributes |
| ---------- |
| estimator_ : :class:`~sklearn.tree.ExtraTreeClassifier` |
| The child estimator template used to create the collection of fitted |
| sub-estimators. |
| |
| .. versionadded:: 1.2 |
| `base_estimator_` was renamed to `estimator_`. |
| |
estimators_ : list of ExtraTreeClassifier
| The collection of fitted sub-estimators. |
| |
| classes_ : ndarray of shape (n_classes,) or a list of such arrays |
| The classes labels (single output problem), or a list of arrays of |
| class labels (multi-output problem). |
| |
| n_classes_ : int or list |
| The number of classes (single output problem), or a list containing the |
| number of classes for each output (multi-output problem). |
| |
| feature_importances_ : ndarray of shape (n_features,) |
| The impurity-based feature importances. |
| The higher, the more important the feature. |
| The importance of a feature is computed as the (normalized) |
| total reduction of the criterion brought by that feature. It is also |
| known as the Gini importance. |
| |
| Warning: impurity-based feature importances can be misleading for |
| high cardinality features (many unique values). See |
| :func:`sklearn.inspection.permutation_importance` as an alternative. |
| |
| n_features_in_ : int |
| Number of features seen during :term:`fit`. |
| |
| .. versionadded:: 0.24 |
| |
| feature_names_in_ : ndarray of shape (`n_features_in_`,) |
| Names of features seen during :term:`fit`. Defined only when `X` |
| has feature names that are all strings. |
| |
| .. versionadded:: 1.0 |
| |
| n_outputs_ : int |
| The number of outputs when ``fit`` is performed. |
| |
| oob_score_ : float |
| Score of the training dataset obtained using an out-of-bag estimate. |
| This attribute exists only when ``oob_score`` is True. |
| |
| oob_decision_function_ : ndarray of shape (n_samples, n_classes) or \ |
| (n_samples, n_classes, n_outputs) |
| Decision function computed with out-of-bag estimate on the training |
set. If n_estimators is small, it might be possible that a data point
| was never left out during the bootstrap. In this case, |
| `oob_decision_function_` might contain NaN. This attribute exists |
| only when ``oob_score`` is True. |
| |
| estimators_samples_ : list of arrays |
| The subset of drawn samples (i.e., the in-bag samples) for each base |
| estimator. Each subset is defined by an array of the indices selected. |
| |
| .. versionadded:: 1.4 |
| |
| See Also |
| -------- |
| ExtraTreesRegressor : An extra-trees regressor with random splits. |
| RandomForestClassifier : A random forest classifier with optimal splits. |
| RandomForestRegressor : Ensemble regressor using trees with optimal splits. |
| |
| Notes |
| ----- |
| The default values for the parameters controlling the size of the trees |
| (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and |
| unpruned trees which can potentially be very large on some data sets. To |
| reduce memory consumption, the complexity and size of the trees should be |
| controlled by setting those parameter values. |
| |
| References |
| ---------- |
.. [1] P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized
| trees", Machine Learning, 63(1), 3-42, 2006. |
| |
| Examples |
| -------- |
| >>> from sklearn.ensemble import ExtraTreesClassifier |
| >>> from sklearn.datasets import make_classification |
| >>> X, y = make_classification(n_features=4, random_state=0) |
| >>> clf = ExtraTreesClassifier(n_estimators=100, random_state=0) |
| >>> clf.fit(X, y) |
| ExtraTreesClassifier(random_state=0) |
| >>> clf.predict([[0, 0, 0, 0]]) |
| array([1]) |
| """ |
|
|
| _parameter_constraints: dict = { |
| **ForestClassifier._parameter_constraints, |
| **DecisionTreeClassifier._parameter_constraints, |
| "class_weight": [ |
| StrOptions({"balanced_subsample", "balanced"}), |
| dict, |
| list, |
| None, |
| ], |
| } |
| _parameter_constraints.pop("splitter") |
|
|
| def __init__( |
| self, |
| n_estimators=100, |
| *, |
| criterion="gini", |
| max_depth=None, |
| min_samples_split=2, |
| min_samples_leaf=1, |
| min_weight_fraction_leaf=0.0, |
| max_features="sqrt", |
| max_leaf_nodes=None, |
| min_impurity_decrease=0.0, |
| bootstrap=False, |
| oob_score=False, |
| n_jobs=None, |
| random_state=None, |
| verbose=0, |
| warm_start=False, |
| class_weight=None, |
| ccp_alpha=0.0, |
| max_samples=None, |
| monotonic_cst=None, |
| ): |
| super().__init__( |
| estimator=ExtraTreeClassifier(), |
| n_estimators=n_estimators, |
| estimator_params=( |
| "criterion", |
| "max_depth", |
| "min_samples_split", |
| "min_samples_leaf", |
| "min_weight_fraction_leaf", |
| "max_features", |
| "max_leaf_nodes", |
| "min_impurity_decrease", |
| "random_state", |
| "ccp_alpha", |
| "monotonic_cst", |
| ), |
| bootstrap=bootstrap, |
| oob_score=oob_score, |
| n_jobs=n_jobs, |
| random_state=random_state, |
| verbose=verbose, |
| warm_start=warm_start, |
| class_weight=class_weight, |
| max_samples=max_samples, |
| ) |
|
|
| self.criterion = criterion |
| self.max_depth = max_depth |
| self.min_samples_split = min_samples_split |
| self.min_samples_leaf = min_samples_leaf |
| self.min_weight_fraction_leaf = min_weight_fraction_leaf |
| self.max_features = max_features |
| self.max_leaf_nodes = max_leaf_nodes |
| self.min_impurity_decrease = min_impurity_decrease |
| self.ccp_alpha = ccp_alpha |
| self.monotonic_cst = monotonic_cst |
|
|
|
|
| class ExtraTreesRegressor(ForestRegressor): |
| """ |
| An extra-trees regressor. |
| |
| This class implements a meta estimator that fits a number of |
| randomized decision trees (a.k.a. extra-trees) on various sub-samples |
| of the dataset and uses averaging to improve the predictive accuracy |
| and control over-fitting. |
| |
| Read more in the :ref:`User Guide <forest>`. |
| |
| Parameters |
| ---------- |
| n_estimators : int, default=100 |
| The number of trees in the forest. |
| |
| .. versionchanged:: 0.22 |
| The default value of ``n_estimators`` changed from 10 to 100 |
| in 0.22. |
| |
| criterion : {"squared_error", "absolute_error", "friedman_mse", "poisson"}, \ |
| default="squared_error" |
| The function to measure the quality of a split. Supported criteria |
| are "squared_error" for the mean squared error, which is equal to |
| variance reduction as feature selection criterion and minimizes the L2 |
| loss using the mean of each terminal node, "friedman_mse", which uses |
| mean squared error with Friedman's improvement score for potential |
| splits, "absolute_error" for the mean absolute error, which minimizes |
| the L1 loss using the median of each terminal node, and "poisson" which |
| uses reduction in Poisson deviance to find splits. |
| Training using "absolute_error" is significantly slower |
| than when using "squared_error". |
| |
| .. versionadded:: 0.18 |
| Mean Absolute Error (MAE) criterion. |
| |
| max_depth : int, default=None |
| The maximum depth of the tree. If None, then nodes are expanded until |
| all leaves are pure or until all leaves contain less than |
| min_samples_split samples. |
| |
| min_samples_split : int or float, default=2 |
| The minimum number of samples required to split an internal node: |
| |
| - If int, then consider `min_samples_split` as the minimum number. |
| - If float, then `min_samples_split` is a fraction and |
| `ceil(min_samples_split * n_samples)` are the minimum |
| number of samples for each split. |
| |
| .. versionchanged:: 0.18 |
| Added float values for fractions. |
| |
| min_samples_leaf : int or float, default=1 |
| The minimum number of samples required to be at a leaf node. |
| A split point at any depth will only be considered if it leaves at |
| least ``min_samples_leaf`` training samples in each of the left and |
| right branches. This may have the effect of smoothing the model, |
| especially in regression. |
| |
| - If int, then consider `min_samples_leaf` as the minimum number. |
| - If float, then `min_samples_leaf` is a fraction and |
| `ceil(min_samples_leaf * n_samples)` are the minimum |
| number of samples for each node. |
| |
| .. versionchanged:: 0.18 |
| Added float values for fractions. |
| |
| min_weight_fraction_leaf : float, default=0.0 |
| The minimum weighted fraction of the sum total of weights (of all |
| the input samples) required to be at a leaf node. Samples have |
| equal weight when sample_weight is not provided. |
| |
| max_features : {"sqrt", "log2", None}, int or float, default=1.0 |
| The number of features to consider when looking for the best split: |
| |
| - If int, then consider `max_features` features at each split. |
| - If float, then `max_features` is a fraction and |
| `max(1, int(max_features * n_features_in_))` features are considered at each |
| split. |
| - If "sqrt", then `max_features=sqrt(n_features)`. |
| - If "log2", then `max_features=log2(n_features)`. |
| - If None or 1.0, then `max_features=n_features`. |
| |
| .. note:: |
| The default of 1.0 is equivalent to bagged trees and more |
| randomness can be achieved by setting smaller values, e.g. 0.3. |
| |
| .. versionchanged:: 1.1 |
| The default of `max_features` changed from `"auto"` to 1.0. |
| |
| Note: the search for a split does not stop until at least one |
valid partition of the node samples is found, even if it effectively
requires inspecting more than ``max_features`` features.
| |
| max_leaf_nodes : int, default=None |
| Grow trees with ``max_leaf_nodes`` in best-first fashion. |
| Best nodes are defined as relative reduction in impurity. |
| If None then unlimited number of leaf nodes. |
| |
| min_impurity_decrease : float, default=0.0 |
| A node will be split if this split induces a decrease of the impurity |
| greater than or equal to this value. |
| |
| The weighted impurity decrease equation is the following:: |
| |
| N_t / N * (impurity - N_t_R / N_t * right_impurity |
| - N_t_L / N_t * left_impurity) |
| |
| where ``N`` is the total number of samples, ``N_t`` is the number of |
| samples at the current node, ``N_t_L`` is the number of samples in the |
| left child, and ``N_t_R`` is the number of samples in the right child. |
| |
| ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, |
| if ``sample_weight`` is passed. |
| |
| .. versionadded:: 0.19 |
| |
| bootstrap : bool, default=False |
| Whether bootstrap samples are used when building trees. If False, the |
| whole dataset is used to build each tree. |
| |
| oob_score : bool or callable, default=False |
| Whether to use out-of-bag samples to estimate the generalization score. |
| By default, :func:`~sklearn.metrics.r2_score` is used. |
| Provide a callable with signature `metric(y_true, y_pred)` to use a |
| custom metric. Only available if `bootstrap=True`. |
| |
| n_jobs : int, default=None |
| The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`, |
| :meth:`decision_path` and :meth:`apply` are all parallelized over the |
| trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` |
| context. ``-1`` means using all processors. See :term:`Glossary |
| <n_jobs>` for more details. |
| |
| random_state : int, RandomState instance or None, default=None |
| Controls 3 sources of randomness: |
| |
| - the bootstrapping of the samples used when building trees |
| (if ``bootstrap=True``) |
| - the sampling of the features to consider when looking for the best |
| split at each node (if ``max_features < n_features``) |
- the draw of the splits for each of the `max_features` sampled features
| |
| See :term:`Glossary <random_state>` for details. |
| |
| verbose : int, default=0 |
| Controls the verbosity when fitting and predicting. |
| |
| warm_start : bool, default=False |
| When set to ``True``, reuse the solution of the previous call to fit |
| and add more estimators to the ensemble, otherwise, just fit a whole |
| new forest. See :term:`Glossary <warm_start>` and |
| :ref:`tree_ensemble_warm_start` for details. |
| |
| ccp_alpha : non-negative float, default=0.0 |
| Complexity parameter used for Minimal Cost-Complexity Pruning. The |
| subtree with the largest cost complexity that is smaller than |
| ``ccp_alpha`` will be chosen. By default, no pruning is performed. See |
| :ref:`minimal_cost_complexity_pruning` for details. See |
| :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` |
| for an example of such pruning. |
| |
| .. versionadded:: 0.22 |
| |
| max_samples : int or float, default=None |
| If bootstrap is True, the number of samples to draw from X |
| to train each base estimator. |
| |
| - If None (default), then draw `X.shape[0]` samples. |
| - If int, then draw `max_samples` samples. |
- If float, then draw `max(round(n_samples * max_samples), 1)` samples. Thus,
| `max_samples` should be in the interval `(0.0, 1.0]`. |
| |
| .. versionadded:: 0.22 |
| |
| monotonic_cst : array-like of int of shape (n_features), default=None |
| Indicates the monotonicity constraint to enforce on each feature. |
| - 1: monotonically increasing |
| - 0: no constraint |
| - -1: monotonically decreasing |
| |
| If monotonic_cst is None, no constraints are applied. |
| |
| Monotonicity constraints are not supported for: |
| - multioutput regressions (i.e. when `n_outputs_ > 1`), |
| - regressions trained on data with missing values. |
| |
| Read more in the :ref:`User Guide <monotonic_cst_gbdt>`. |
| |
| .. versionadded:: 1.4 |
| |
| Attributes |
| ---------- |
| estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` |
| The child estimator template used to create the collection of fitted |
| sub-estimators. |
| |
| .. versionadded:: 1.2 |
| `base_estimator_` was renamed to `estimator_`. |
| |
estimators_ : list of ExtraTreeRegressor
| The collection of fitted sub-estimators. |
| |
| feature_importances_ : ndarray of shape (n_features,) |
| The impurity-based feature importances. |
| The higher, the more important the feature. |
| The importance of a feature is computed as the (normalized) |
| total reduction of the criterion brought by that feature. It is also |
| known as the Gini importance. |
| |
| Warning: impurity-based feature importances can be misleading for |
| high cardinality features (many unique values). See |
| :func:`sklearn.inspection.permutation_importance` as an alternative. |
| |
| n_features_in_ : int |
| Number of features seen during :term:`fit`. |
| |
| .. versionadded:: 0.24 |
| |
| feature_names_in_ : ndarray of shape (`n_features_in_`,) |
| Names of features seen during :term:`fit`. Defined only when `X` |
| has feature names that are all strings. |
| |
| .. versionadded:: 1.0 |
| |
| n_outputs_ : int |
| The number of outputs. |
| |
| oob_score_ : float |
| Score of the training dataset obtained using an out-of-bag estimate. |
| This attribute exists only when ``oob_score`` is True. |
| |
| oob_prediction_ : ndarray of shape (n_samples,) or (n_samples, n_outputs) |
| Prediction computed with out-of-bag estimate on the training set. |
| This attribute exists only when ``oob_score`` is True. |
| |
| estimators_samples_ : list of arrays |
| The subset of drawn samples (i.e., the in-bag samples) for each base |
| estimator. Each subset is defined by an array of the indices selected. |
| |
| .. versionadded:: 1.4 |
| |
| See Also |
| -------- |
| ExtraTreesClassifier : An extra-trees classifier with random splits. |
| RandomForestClassifier : A random forest classifier with optimal splits. |
| RandomForestRegressor : Ensemble regressor using trees with optimal splits. |
| |
| Notes |
| ----- |
| The default values for the parameters controlling the size of the trees |
| (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and |
| unpruned trees which can potentially be very large on some data sets. To |
| reduce memory consumption, the complexity and size of the trees should be |
| controlled by setting those parameter values. |
| |
| References |
| ---------- |
.. [1] P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees",
| Machine Learning, 63(1), 3-42, 2006. |
| |
| Examples |
| -------- |
| >>> from sklearn.datasets import load_diabetes |
| >>> from sklearn.model_selection import train_test_split |
| >>> from sklearn.ensemble import ExtraTreesRegressor |
| >>> X, y = load_diabetes(return_X_y=True) |
| >>> X_train, X_test, y_train, y_test = train_test_split( |
| ... X, y, random_state=0) |
| >>> reg = ExtraTreesRegressor(n_estimators=100, random_state=0).fit( |
| ... X_train, y_train) |
| >>> reg.score(X_test, y_test) |
| 0.2727... |
| """ |
|
|
| _parameter_constraints: dict = { |
| **ForestRegressor._parameter_constraints, |
| **DecisionTreeRegressor._parameter_constraints, |
| } |
| _parameter_constraints.pop("splitter") |
|
|
| def __init__( |
| self, |
| n_estimators=100, |
| *, |
| criterion="squared_error", |
| max_depth=None, |
| min_samples_split=2, |
| min_samples_leaf=1, |
| min_weight_fraction_leaf=0.0, |
| max_features=1.0, |
| max_leaf_nodes=None, |
| min_impurity_decrease=0.0, |
| bootstrap=False, |
| oob_score=False, |
| n_jobs=None, |
| random_state=None, |
| verbose=0, |
| warm_start=False, |
| ccp_alpha=0.0, |
| max_samples=None, |
| monotonic_cst=None, |
| ): |
| super().__init__( |
| estimator=ExtraTreeRegressor(), |
| n_estimators=n_estimators, |
| estimator_params=( |
| "criterion", |
| "max_depth", |
| "min_samples_split", |
| "min_samples_leaf", |
| "min_weight_fraction_leaf", |
| "max_features", |
| "max_leaf_nodes", |
| "min_impurity_decrease", |
| "random_state", |
| "ccp_alpha", |
| "monotonic_cst", |
| ), |
| bootstrap=bootstrap, |
| oob_score=oob_score, |
| n_jobs=n_jobs, |
| random_state=random_state, |
| verbose=verbose, |
| warm_start=warm_start, |
| max_samples=max_samples, |
| ) |
|
|
| self.criterion = criterion |
| self.max_depth = max_depth |
| self.min_samples_split = min_samples_split |
| self.min_samples_leaf = min_samples_leaf |
| self.min_weight_fraction_leaf = min_weight_fraction_leaf |
| self.max_features = max_features |
| self.max_leaf_nodes = max_leaf_nodes |
| self.min_impurity_decrease = min_impurity_decrease |
| self.ccp_alpha = ccp_alpha |
| self.monotonic_cst = monotonic_cst |
|
|
|
|
| class RandomTreesEmbedding(TransformerMixin, BaseForest): |
| """ |
| An ensemble of totally random trees. |
| |
| An unsupervised transformation of a dataset to a high-dimensional |
| sparse representation. A datapoint is coded according to which leaf of |
| each tree it is sorted into. Using a one-hot encoding of the leaves, |
| this leads to a binary coding with as many ones as there are trees in |
| the forest. |
| |
| The dimensionality of the resulting representation is |
| ``n_out <= n_estimators * max_leaf_nodes``. If ``max_leaf_nodes == None``, |
| the number of leaf nodes is at most ``n_estimators * 2 ** max_depth``. |
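
As a worked instance of this bound, the defaults ``n_estimators=100`` and
``max_depth=5`` give at most ``100 * 2 ** 5 = 3200`` output columns, with
exactly 100 ones per row (one active leaf per tree).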
| |
| For an example of applying Random Trees Embedding to non-linear |
| classification, see |
| :ref:`sphx_glr_auto_examples_ensemble_plot_random_forest_embedding.py`. |
| |
| Read more in the :ref:`User Guide <random_trees_embedding>`. |
| |
| Parameters |
| ---------- |
| n_estimators : int, default=100 |
| Number of trees in the forest. |
| |
| .. versionchanged:: 0.22 |
| The default value of ``n_estimators`` changed from 10 to 100 |
| in 0.22. |
| |
| max_depth : int, default=5 |
| The maximum depth of each tree. If None, then nodes are expanded until |
| all leaves are pure or until all leaves contain less than |
| min_samples_split samples. |
| |
| min_samples_split : int or float, default=2 |
| The minimum number of samples required to split an internal node: |
| |
| - If int, then consider `min_samples_split` as the minimum number. |
| - If float, then `min_samples_split` is a fraction and |
| `ceil(min_samples_split * n_samples)` is the minimum |
| number of samples for each split. |
| |
| .. versionchanged:: 0.18 |
| Added float values for fractions. |
| |
| min_samples_leaf : int or float, default=1 |
| The minimum number of samples required to be at a leaf node. |
| A split point at any depth will only be considered if it leaves at |
| least ``min_samples_leaf`` training samples in each of the left and |
| right branches. This may have the effect of smoothing the model, |
| especially in regression. |
| |
| - If int, then consider `min_samples_leaf` as the minimum number. |
| - If float, then `min_samples_leaf` is a fraction and |
| `ceil(min_samples_leaf * n_samples)` is the minimum |
| number of samples for each node. |
| |
| .. versionchanged:: 0.18 |
| Added float values for fractions. |
| |
| min_weight_fraction_leaf : float, default=0.0 |
| The minimum weighted fraction of the sum total of weights (of all |
| the input samples) required to be at a leaf node. Samples have |
| equal weight when sample_weight is not provided. |
| |
| max_leaf_nodes : int, default=None |
| Grow trees with ``max_leaf_nodes`` in best-first fashion. |
| Best nodes are defined as relative reduction in impurity. |
| If None then unlimited number of leaf nodes. |
| |
| min_impurity_decrease : float, default=0.0 |
| A node will be split if this split induces a decrease of the impurity |
| greater than or equal to this value. |
| |
| The weighted impurity decrease equation is the following:: |
| |
| N_t / N * (impurity - N_t_R / N_t * right_impurity |
| - N_t_L / N_t * left_impurity) |
| |
| where ``N`` is the total number of samples, ``N_t`` is the number of |
| samples at the current node, ``N_t_L`` is the number of samples in the |
| left child, and ``N_t_R`` is the number of samples in the right child. |
| |
| ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, |
| if ``sample_weight`` is passed. |
| |
| .. versionadded:: 0.19 |
| |
| sparse_output : bool, default=True |
Whether to return a sparse CSR matrix (the default behavior) or a
dense array compatible with dense pipeline operators.
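
For example, a minimal sketch requesting a dense embedding, assuming a
2-D training array ``X``::

embedder = RandomTreesEmbedding(sparse_output=False)
X_dense = embedder.fit_transform(X)  # ndarray instead of a sparse matrix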
| |
| n_jobs : int, default=None |
| The number of jobs to run in parallel. :meth:`fit`, :meth:`transform`, |
| :meth:`decision_path` and :meth:`apply` are all parallelized over the |
| trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` |
| context. ``-1`` means using all processors. See :term:`Glossary |
| <n_jobs>` for more details. |
| |
| random_state : int, RandomState instance or None, default=None |
| Controls the generation of the random `y` used to fit the trees |
| and the draw of the splits for each feature at the trees' nodes. |
| See :term:`Glossary <random_state>` for details. |
| |
| verbose : int, default=0 |
| Controls the verbosity when fitting and predicting. |
| |
| warm_start : bool, default=False |
| When set to ``True``, reuse the solution of the previous call to fit |
| and add more estimators to the ensemble, otherwise, just fit a whole |
| new forest. See :term:`Glossary <warm_start>` and |
| :ref:`tree_ensemble_warm_start` for details. |
| |
| Attributes |
| ---------- |
| estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` instance |
| The child estimator template used to create the collection of fitted |
| sub-estimators. |
| |
| .. versionadded:: 1.2 |
| `base_estimator_` was renamed to `estimator_`. |
| |
| estimators_ : list of :class:`~sklearn.tree.ExtraTreeRegressor` instances |
| The collection of fitted sub-estimators. |
| |
| feature_importances_ : ndarray of shape (n_features,) |
| The feature importances (the higher, the more important the feature). |
| |
| n_features_in_ : int |
| Number of features seen during :term:`fit`. |
| |
| .. versionadded:: 0.24 |
| |
| feature_names_in_ : ndarray of shape (`n_features_in_`,) |
| Names of features seen during :term:`fit`. Defined only when `X` |
| has feature names that are all strings. |
| |
| .. versionadded:: 1.0 |
| |
| n_outputs_ : int |
| The number of outputs when ``fit`` is performed. |
| |
| one_hot_encoder_ : OneHotEncoder instance |
| One-hot encoder used to create the sparse embedding. |
| |
| estimators_samples_ : list of arrays |
| The subset of drawn samples (i.e., the in-bag samples) for each base |
| estimator. Each subset is defined by an array of the indices selected. |
| |
| .. versionadded:: 1.4 |
| |
| See Also |
| -------- |
| ExtraTreesClassifier : An extra-trees classifier. |
| ExtraTreesRegressor : An extra-trees regressor. |
| RandomForestClassifier : A random forest classifier. |
| RandomForestRegressor : A random forest regressor. |
sklearn.tree.ExtraTreeClassifier : An extremely randomized
| tree classifier. |
| sklearn.tree.ExtraTreeRegressor : An extremely randomized |
| tree regressor. |
| |
| References |
| ---------- |
.. [1] P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees",
| Machine Learning, 63(1), 3-42, 2006. |
.. [2] F. Moosmann, B. Triggs, and F. Jurie, "Fast discriminative
visual codebooks using randomized clustering forests", NIPS 2007.
| |
| Examples |
| -------- |
| >>> from sklearn.ensemble import RandomTreesEmbedding |
| >>> X = [[0,0], [1,0], [0,1], [-1,0], [0,-1]] |
| >>> random_trees = RandomTreesEmbedding( |
| ... n_estimators=5, random_state=0, max_depth=1).fit(X) |
| >>> X_sparse_embedding = random_trees.transform(X) |
| >>> X_sparse_embedding.toarray() |
| array([[0., 1., 1., 0., 1., 0., 0., 1., 1., 0.], |
| [0., 1., 1., 0., 1., 0., 0., 1., 1., 0.], |
| [0., 1., 0., 1., 0., 1., 0., 1., 0., 1.], |
| [1., 0., 1., 0., 1., 0., 1., 0., 1., 0.], |
| [0., 1., 1., 0., 1., 0., 0., 1., 1., 0.]]) |
| """ |
|
|
| _parameter_constraints: dict = { |
| "n_estimators": [Interval(Integral, 1, None, closed="left")], |
| "n_jobs": [Integral, None], |
| "verbose": ["verbose"], |
| "warm_start": ["boolean"], |
| **BaseDecisionTree._parameter_constraints, |
| "sparse_output": ["boolean"], |
| } |
| for param in ("max_features", "ccp_alpha", "splitter", "monotonic_cst"): |
| _parameter_constraints.pop(param) |
|
|
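# Fixed sub-estimator settings: each split draws a single random feature
# (with a random threshold via ExtraTreeRegressor), so the trees are
# totally random.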
| criterion = "squared_error" |
| max_features = 1 |
|
|
| def __init__( |
| self, |
| n_estimators=100, |
| *, |
| max_depth=5, |
| min_samples_split=2, |
| min_samples_leaf=1, |
| min_weight_fraction_leaf=0.0, |
| max_leaf_nodes=None, |
| min_impurity_decrease=0.0, |
| sparse_output=True, |
| n_jobs=None, |
| random_state=None, |
| verbose=0, |
| warm_start=False, |
| ): |
| super().__init__( |
| estimator=ExtraTreeRegressor(), |
| n_estimators=n_estimators, |
| estimator_params=( |
| "criterion", |
| "max_depth", |
| "min_samples_split", |
| "min_samples_leaf", |
| "min_weight_fraction_leaf", |
| "max_features", |
| "max_leaf_nodes", |
| "min_impurity_decrease", |
| "random_state", |
| ), |
| bootstrap=False, |
| oob_score=False, |
| n_jobs=n_jobs, |
| random_state=random_state, |
| verbose=verbose, |
| warm_start=warm_start, |
| max_samples=None, |
| ) |
|
|
| self.max_depth = max_depth |
| self.min_samples_split = min_samples_split |
| self.min_samples_leaf = min_samples_leaf |
| self.min_weight_fraction_leaf = min_weight_fraction_leaf |
| self.max_leaf_nodes = max_leaf_nodes |
| self.min_impurity_decrease = min_impurity_decrease |
| self.sparse_output = sparse_output |
|
|
| def _set_oob_score_and_attributes(self, X, y, scoring_function=None): |
| raise NotImplementedError("OOB score not supported by tree embedding") |
|
|
| def fit(self, X, y=None, sample_weight=None): |
| """ |
| Fit estimator. |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) |
| The input samples. Use ``dtype=np.float32`` for maximum |
efficiency. Sparse matrices are also supported; use a sparse
``csc_matrix`` for maximum efficiency.
| |
| y : Ignored |
| Not used, present for API consistency by convention. |
| |
| sample_weight : array-like of shape (n_samples,), default=None |
| Sample weights. If None, then samples are equally weighted. Splits |
| that would create child nodes with net zero or negative weight are |
| ignored while searching for a split in each node. In the case of |
| classification, splits are also ignored if they would result in any |
| single class carrying a negative weight in either child node. |
| |
| Returns |
| ------- |
| self : object |
| Returns the instance itself. |
| """ |
| |
| self.fit_transform(X, y, sample_weight=sample_weight) |
| return self |
|
|
| @_fit_context(prefer_skip_nested_validation=True) |
| def fit_transform(self, X, y=None, sample_weight=None): |
| """ |
| Fit estimator and transform dataset. |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) |
| Input data used to build forests. Use ``dtype=np.float32`` for |
| maximum efficiency. |
| |
| y : Ignored |
| Not used, present for API consistency by convention. |
| |
| sample_weight : array-like of shape (n_samples,), default=None |
| Sample weights. If None, then samples are equally weighted. Splits |
| that would create child nodes with net zero or negative weight are |
| ignored while searching for a split in each node. In the case of |
| classification, splits are also ignored if they would result in any |
| single class carrying a negative weight in either child node. |
| |
| Returns |
| ------- |
| X_transformed : sparse matrix of shape (n_samples, n_out) |
| Transformed dataset. |
| """ |
| rnd = check_random_state(self.random_state) |
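# Fit the totally random trees on a synthetic, uniformly random target so
# that the tree structure depends only on X (and the random splits).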
| y = rnd.uniform(size=_num_samples(X)) |
| super().fit(X, y, sample_weight=sample_weight) |
|
|
| self.one_hot_encoder_ = OneHotEncoder(sparse_output=self.sparse_output) |
| output = self.one_hot_encoder_.fit_transform(self.apply(X)) |
| self._n_features_out = output.shape[1] |
| return output |
|
|
| def get_feature_names_out(self, input_features=None): |
| """Get output feature names for transformation. |
| |
| Parameters |
| ---------- |
| input_features : array-like of str or None, default=None |
| Only used to validate feature names with the names seen in :meth:`fit`. |
| |
| Returns |
| ------- |
| feature_names_out : ndarray of str objects |
| Transformed feature names, in the format of |
| `randomtreesembedding_{tree}_{leaf}`, where `tree` is the tree used |
| to generate the leaf and `leaf` is the index of a leaf node |
| in that tree. Note that the node indexing scheme is used to |
| index both nodes with children (split nodes) and leaf nodes. |
| Only the latter can be present as output features. |
| As a consequence, there are missing indices in the output |
| feature names. |
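
For illustration (actual leaf indices depend on the fitted trees), a name
such as ``randomtreesembedding_0_3`` refers to leaf node 3 of tree 0.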
| """ |
| check_is_fitted(self, "_n_features_out") |
| _check_feature_names_in( |
| self, input_features=input_features, generate_names=False |
| ) |
|
|
| feature_names = [ |
| f"randomtreesembedding_{tree}_{leaf}" |
| for tree in range(self.n_estimators) |
| for leaf in self.one_hot_encoder_.categories_[tree] |
| ] |
| return np.asarray(feature_names, dtype=object) |
|
|
| def transform(self, X): |
| """ |
| Transform dataset. |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) |
| Input data to be transformed. Use ``dtype=np.float32`` for maximum |
efficiency. Sparse matrices are also supported; use a sparse
``csr_matrix`` for maximum efficiency.
| |
| Returns |
| ------- |
| X_transformed : sparse matrix of shape (n_samples, n_out) |
| Transformed dataset. |
| """ |
| check_is_fitted(self) |
| return self.one_hot_encoder_.transform(self.apply(X)) |
|
|
| def __sklearn_tags__(self): |
| tags = super().__sklearn_tags__() |
| tags.input_tags.sparse = True |
| return tags |
|
|