| """Ordering Points To Identify the Clustering Structure (OPTICS) |
| |
| These routines execute the OPTICS algorithm, and implement various |
| cluster extraction methods of the ordered list. |
| |
| Authors: Shane Grigsby <refuge@rocktalus.com> |
| Adrin Jalali <adrinjalali@gmail.com> |
| Erich Schubert <erich@debian.org> |
| Hanmin Qin <qinhanmin2005@sina.com> |
| License: BSD 3 clause |
| """ |
|
|
| |
| |
|
|
| import warnings |
| from numbers import Integral, Real |
|
|
| import numpy as np |
| from scipy.sparse import SparseEfficiencyWarning, issparse |
|
|
| from ..base import BaseEstimator, ClusterMixin, _fit_context |
| from ..exceptions import DataConversionWarning |
| from ..metrics import pairwise_distances |
| from ..metrics.pairwise import _VALID_METRICS, PAIRWISE_BOOLEAN_FUNCTIONS |
| from ..neighbors import NearestNeighbors |
| from ..utils import gen_batches |
| from ..utils._chunking import get_chunk_n_rows |
| from ..utils._param_validation import ( |
| HasMethods, |
| Interval, |
| RealNotInt, |
| StrOptions, |
| validate_params, |
| ) |
| from ..utils.validation import check_memory, validate_data |
|
|
|
|
class OPTICS(ClusterMixin, BaseEstimator):
    """Estimate clustering structure from vector array.

    OPTICS (Ordering Points To Identify the Clustering Structure), closely
    related to DBSCAN, finds core sample of high density and expands clusters
    from them [1]_. Unlike DBSCAN, keeps cluster hierarchy for a variable
    neighborhood radius. Better suited for usage on large datasets than the
    current sklearn implementation of DBSCAN.

    Clusters are then extracted using a DBSCAN-like method
    (cluster_method = 'dbscan') or an automatic
    technique proposed in [1]_ (cluster_method = 'xi').

    This implementation deviates from the original OPTICS by first performing
    k-nearest-neighborhood searches on all points to identify core sizes, then
    computing only the distances to unprocessed points when constructing the
    cluster order. Note that we do not employ a heap to manage the expansion
    candidates, so the time complexity will be O(n^2).

    Read more in the :ref:`User Guide <optics>`.

    Parameters
    ----------
    min_samples : int > 1 or float between 0 and 1, default=5
        The number of samples in a neighborhood for a point to be considered as
        a core point. Also, up and down steep regions can't have more than
        ``min_samples`` consecutive non-steep points. Expressed as an absolute
        number or a fraction of the number of samples (rounded to be at least
        2).

    max_eps : float, default=np.inf
        The maximum distance between two samples for one to be considered as
        in the neighborhood of the other. Default value of ``np.inf`` will
        identify clusters across all scales; reducing ``max_eps`` will result
        in shorter run times.

    metric : str or callable, default='minkowski'
        Metric to use for distance computation. Any metric from scikit-learn
        or scipy.spatial.distance can be used.

        If metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays as input and return one value indicating the
        distance between them. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string. If metric is
        "precomputed", `X` is assumed to be a distance matrix and must be
        square.

        Valid values for metric are:

        - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
          'manhattan']

        - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
          'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
          'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
          'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
          'yule']

        Sparse matrices are only supported by scikit-learn metrics.
        See the documentation for scipy.spatial.distance for details on these
        metrics.

        .. note::
           `'kulsinski'` is deprecated from SciPy 1.9 and will be removed in SciPy 1.11.

    p : float, default=2
        Parameter for the Minkowski metric from
        :class:`~sklearn.metrics.pairwise_distances`. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

    cluster_method : str, default='xi'
        The extraction method used to extract clusters using the calculated
        reachability and ordering. Possible values are "xi" and "dbscan".

    eps : float, default=None
        The maximum distance between two samples for one to be considered as
        in the neighborhood of the other. By default it assumes the same value
        as ``max_eps``.
        Used only when ``cluster_method='dbscan'``.

    xi : float between 0 and 1, default=0.05
        Determines the minimum steepness on the reachability plot that
        constitutes a cluster boundary. For example, an upwards point in the
        reachability plot is defined by the ratio from one point to its
        successor being at most 1-xi.
        Used only when ``cluster_method='xi'``.

    predecessor_correction : bool, default=True
        Correct clusters according to the predecessors calculated by OPTICS
        [2]_. This parameter has minimal effect on most datasets.
        Used only when ``cluster_method='xi'``.

    min_cluster_size : int > 1 or float between 0 and 1, default=None
        Minimum number of samples in an OPTICS cluster, expressed as an
        absolute number or a fraction of the number of samples (rounded to be
        at least 2). If ``None``, the value of ``min_samples`` is used instead.
        Used only when ``cluster_method='xi'``.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use :class:`~sklearn.neighbors.BallTree`.
        - 'kd_tree' will use :class:`~sklearn.neighbors.KDTree`.
        - 'brute' will use a brute-force search.
        - 'auto' (default) will attempt to decide the most appropriate
          algorithm based on the values passed to :meth:`fit` method.

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int, default=30
        Leaf size passed to :class:`~sklearn.neighbors.BallTree` or
        :class:`~sklearn.neighbors.KDTree`. This can affect the speed of the
        construction and query, as well as the memory required to store the
        tree. The optimal value depends on the nature of the problem.

    memory : str or object with the joblib.Memory interface, default=None
        Used to cache the output of the computation of the tree.
        By default, no caching is done. If a string is given, it is the
        path to the caching directory.

    n_jobs : int, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    labels_ : ndarray of shape (n_samples,)
        Cluster labels for each point in the dataset given to fit().
        Noisy samples and points which are not included in a leaf cluster
        of ``cluster_hierarchy_`` are labeled as -1.

    reachability_ : ndarray of shape (n_samples,)
        Reachability distances per sample, indexed by object order. Use
        ``clust.reachability_[clust.ordering_]`` to access in cluster order.

    ordering_ : ndarray of shape (n_samples,)
        The cluster ordered list of sample indices.

    core_distances_ : ndarray of shape (n_samples,)
        Distance at which each sample becomes a core point, indexed by object
        order. Points which will never be core have a distance of inf. Use
        ``clust.core_distances_[clust.ordering_]`` to access in cluster order.

    predecessor_ : ndarray of shape (n_samples,)
        Point that a sample was reached from, indexed by object order.
        Seed points have a predecessor of -1.

    cluster_hierarchy_ : ndarray of shape (n_clusters, 2)
        The list of clusters in the form of ``[start, end]`` in each row, with
        all indices inclusive. The clusters are ordered according to
        ``(end, -start)`` (ascending) so that larger clusters encompassing
        smaller clusters come after those smaller ones. Since ``labels_`` does
        not reflect the hierarchy, usually
        ``len(cluster_hierarchy_) > np.unique(optics.labels_)``. Please also
        note that these indices are of the ``ordering_``, i.e.
        ``X[ordering_][start:end + 1]`` form a cluster.
        Only available when ``cluster_method='xi'``.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    DBSCAN : A similar clustering for a specified neighborhood radius (eps).
        Our implementation is optimized for runtime.

    References
    ----------
    .. [1] Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel,
       and Jörg Sander. "OPTICS: ordering points to identify the clustering
       structure." ACM SIGMOD Record 28, no. 2 (1999): 49-60.

    .. [2] Schubert, Erich, Michael Gertz.
       "Improving the Cluster Structure Extracted from OPTICS Plots." Proc. of
       the Conference "Lernen, Wissen, Daten, Analysen" (LWDA) (2018): 318-329.

    Examples
    --------
    >>> from sklearn.cluster import OPTICS
    >>> import numpy as np
    >>> X = np.array([[1, 2], [2, 5], [3, 6],
    ...               [8, 7], [8, 8], [7, 3]])
    >>> clustering = OPTICS(min_samples=2).fit(X)
    >>> clustering.labels_
    array([0, 0, 0, 1, 1, 1])

    For a more detailed example see
    :ref:`sphx_glr_auto_examples_cluster_plot_optics.py`.
    """

    # Declarative constraints checked before ``fit`` runs (via the
    # ``@_fit_context`` decorator / ``BaseEstimator`` machinery).
    _parameter_constraints: dict = {
        "min_samples": [
            Interval(Integral, 2, None, closed="left"),
            Interval(RealNotInt, 0, 1, closed="both"),
        ],
        "max_eps": [Interval(Real, 0, None, closed="both")],
        "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
        "p": [Interval(Real, 1, None, closed="left")],
        "metric_params": [dict, None],
        "cluster_method": [StrOptions({"dbscan", "xi"})],
        "eps": [Interval(Real, 0, None, closed="both"), None],
        "xi": [Interval(Real, 0, 1, closed="both")],
        "predecessor_correction": ["boolean"],
        "min_cluster_size": [
            Interval(Integral, 2, None, closed="left"),
            Interval(RealNotInt, 0, 1, closed="right"),
            None,
        ],
        "algorithm": [StrOptions({"auto", "brute", "ball_tree", "kd_tree"})],
        "leaf_size": [Interval(Integral, 1, None, closed="left")],
        "memory": [str, HasMethods("cache"), None],
        "n_jobs": [Integral, None],
    }

    def __init__(
        self,
        *,
        min_samples=5,
        max_eps=np.inf,
        metric="minkowski",
        p=2,
        metric_params=None,
        cluster_method="xi",
        eps=None,
        xi=0.05,
        predecessor_correction=True,
        min_cluster_size=None,
        algorithm="auto",
        leaf_size=30,
        memory=None,
        n_jobs=None,
    ):
        # Per scikit-learn convention, __init__ only stores parameters;
        # validation and all computation happen in ``fit``.
        self.max_eps = max_eps
        self.min_samples = min_samples
        self.min_cluster_size = min_cluster_size
        self.algorithm = algorithm
        self.metric = metric
        self.metric_params = metric_params
        self.p = p
        self.leaf_size = leaf_size
        self.cluster_method = cluster_method
        self.eps = eps
        self.xi = xi
        self.predecessor_correction = predecessor_correction
        self.memory = memory
        self.n_jobs = n_jobs

    @_fit_context(
        # OPTICS.metric is not validated yet
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y=None):
        """Perform OPTICS clustering.

        Extracts an ordered list of points and reachability distances, and
        performs initial clustering using ``max_eps`` distance specified at
        OPTICS object instantiation.

        Parameters
        ----------
        X : {ndarray, sparse matrix} of shape (n_samples, n_features), or \
                (n_samples, n_samples) if metric='precomputed'
            A feature array, or array of distances between samples if
            metric='precomputed'. If a sparse matrix is provided, it will be
            converted into CSR format.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns a fitted instance of self.
        """
        # Boolean metrics only make sense on boolean data: warn before the
        # implicit conversion performed by ``validate_data`` below.
        dtype = bool if self.metric in PAIRWISE_BOOLEAN_FUNCTIONS else float
        if dtype is bool and X.dtype != bool:
            msg = (
                "Data will be converted to boolean for"
                f" metric {self.metric}, to avoid this warning,"
                " you may convert the data prior to calling fit."
            )
            warnings.warn(msg, DataConversionWarning)

        X = validate_data(self, X, dtype=dtype, accept_sparse="csr")
        if self.metric == "precomputed" and issparse(X):
            # Copy so we do not mutate the user's matrix in place.
            X = X.copy()
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", SparseEfficiencyWarning)
                # Set each diagonal entry explicitly so every point is
                # stored as its own neighbor in the sparse structure.
                X.setdiag(X.diagonal())
        memory = check_memory(self.memory)

        # The expensive graph computation is (optionally) cached via
        # joblib.Memory when ``self.memory`` is set.
        (
            self.ordering_,
            self.core_distances_,
            self.reachability_,
            self.predecessor_,
        ) = memory.cache(compute_optics_graph)(
            X=X,
            min_samples=self.min_samples,
            algorithm=self.algorithm,
            leaf_size=self.leaf_size,
            metric=self.metric,
            metric_params=self.metric_params,
            p=self.p,
            n_jobs=self.n_jobs,
            max_eps=self.max_eps,
        )

        # Extract clusters from the computed ordering and reachability.
        if self.cluster_method == "xi":
            labels_, clusters_ = cluster_optics_xi(
                reachability=self.reachability_,
                predecessor=self.predecessor_,
                ordering=self.ordering_,
                min_samples=self.min_samples,
                min_cluster_size=self.min_cluster_size,
                xi=self.xi,
                predecessor_correction=self.predecessor_correction,
            )
            self.cluster_hierarchy_ = clusters_
        elif self.cluster_method == "dbscan":
            if self.eps is None:
                eps = self.max_eps
            else:
                eps = self.eps

            # Distances beyond max_eps were never computed by the graph
            # construction, so a larger eps cannot be honored.
            if eps > self.max_eps:
                raise ValueError(
                    "Specify an epsilon smaller than %s. Got %s." % (self.max_eps, eps)
                )

            labels_ = cluster_optics_dbscan(
                reachability=self.reachability_,
                core_distances=self.core_distances_,
                ordering=self.ordering_,
                eps=eps,
            )

        self.labels_ = labels_
        return self
|
|
|
|
| def _validate_size(size, n_samples, param_name): |
| if size > n_samples: |
| raise ValueError( |
| "%s must be no greater than the number of samples (%d). Got %d" |
| % (param_name, n_samples, size) |
| ) |
|
|
|
|
| |
def _compute_core_distances_(X, neighbors, min_samples, working_memory):
    """Compute the k-th nearest neighbor of each sample.

    Equivalent to neighbors.kneighbors(X, self.min_samples)[0][:, -1]
    but with more memory efficiency.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data.
    neighbors : NearestNeighbors instance
        The fitted nearest neighbors estimator.
    min_samples : int
        Number of neighbors queried per sample; the core distance is the
        distance to the ``min_samples``-th nearest neighbor.
    working_memory : int, default=None
        The sought maximum memory for temporary distance matrix chunks.
        When None (default), the value of
        ``sklearn.get_config()['working_memory']`` is used.

    Returns
    -------
    core_distances : ndarray of shape (n_samples,)
        Distance at which each sample becomes a core point.
        Points which will never be core have a distance of inf.
    """
    n_samples = X.shape[0]
    core_distances = np.full(n_samples, np.nan)

    # Query the k-NN in row chunks so the temporary distance matrices stay
    # within the working-memory budget.
    chunk_n_rows = get_chunk_n_rows(
        row_bytes=16 * min_samples, max_n_rows=n_samples, working_memory=working_memory
    )
    for batch in gen_batches(n_samples, chunk_n_rows):
        knn_distances = neighbors.kneighbors(X[batch], min_samples)[0]
        core_distances[batch] = knn_distances[:, -1]
    return core_distances
|
|
|
|
@validate_params(
    {
        "X": [np.ndarray, "sparse matrix"],
        "min_samples": [
            Interval(Integral, 2, None, closed="left"),
            Interval(RealNotInt, 0, 1, closed="both"),
        ],
        "max_eps": [Interval(Real, 0, None, closed="both")],
        "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
        # NOTE(review): the OPTICS class constrains p to [1, inf) while this
        # allows 0 < p — confirm the discrepancy is intended.
        "p": [Interval(Real, 0, None, closed="right"), None],
        "metric_params": [dict, None],
        "algorithm": [StrOptions({"auto", "brute", "ball_tree", "kd_tree"})],
        "leaf_size": [Interval(Integral, 1, None, closed="left")],
        "n_jobs": [Integral, None],
    },
    prefer_skip_nested_validation=False,
)
def compute_optics_graph(
    X, *, min_samples, max_eps, metric, p, metric_params, algorithm, leaf_size, n_jobs
):
    """Compute the OPTICS reachability graph.

    Read more in the :ref:`User Guide <optics>`.

    Parameters
    ----------
    X : {ndarray, sparse matrix} of shape (n_samples, n_features), or \
            (n_samples, n_samples) if metric='precomputed'
        A feature array, or array of distances between samples if
        metric='precomputed'.

    min_samples : int > 1 or float between 0 and 1
        The number of samples in a neighborhood for a point to be considered
        as a core point. Expressed as an absolute number or a fraction of the
        number of samples (rounded to be at least 2).

    max_eps : float, default=np.inf
        The maximum distance between two samples for one to be considered as
        in the neighborhood of the other. Default value of ``np.inf`` will
        identify clusters across all scales; reducing ``max_eps`` will result
        in shorter run times.

    metric : str or callable, default='minkowski'
        Metric to use for distance computation. Any metric from scikit-learn
        or scipy.spatial.distance can be used.

        If metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays as input and return one value indicating the
        distance between them. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string. If metric is
        "precomputed", X is assumed to be a distance matrix and must be square.

        Valid values for metric are:

        - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
          'manhattan']

        - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
          'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
          'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
          'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
          'yule']

        See the documentation for scipy.spatial.distance for details on these
        metrics.

        .. note::
           `'kulsinski'` is deprecated from SciPy 1.9 and will be removed in SciPy 1.11.

    p : float, default=2
        Parameter for the Minkowski metric from
        :class:`~sklearn.metrics.pairwise_distances`. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use :class:`~sklearn.neighbors.BallTree`.
        - 'kd_tree' will use :class:`~sklearn.neighbors.KDTree`.
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to `fit` method. (default)

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int, default=30
        Leaf size passed to :class:`~sklearn.neighbors.BallTree` or
        :class:`~sklearn.neighbors.KDTree`. This can affect the speed of the
        construction and query, as well as the memory required to store the
        tree. The optimal value depends on the nature of the problem.

    n_jobs : int, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Returns
    -------
    ordering_ : array of shape (n_samples,)
        The cluster ordered list of sample indices.

    core_distances_ : array of shape (n_samples,)
        Distance at which each sample becomes a core point, indexed by object
        order. Points which will never be core have a distance of inf. Use
        ``clust.core_distances_[clust.ordering_]`` to access in cluster order.

    reachability_ : array of shape (n_samples,)
        Reachability distances per sample, indexed by object order. Use
        ``clust.reachability_[clust.ordering_]`` to access in cluster order.

    predecessor_ : array of shape (n_samples,)
        Point that a sample was reached from, indexed by object order.
        Seed points have a predecessor of -1.

    References
    ----------
    .. [1] Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel,
       and Jörg Sander. "OPTICS: ordering points to identify the clustering
       structure." ACM SIGMOD Record 28, no. 2 (1999): 49-60.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.cluster import compute_optics_graph
    >>> X = np.array([[1, 2], [2, 5], [3, 6],
    ...               [8, 7], [8, 8], [7, 3]])
    >>> ordering, core_distances, reachability, predecessor = compute_optics_graph(
    ...     X,
    ...     min_samples=2,
    ...     max_eps=np.inf,
    ...     metric="minkowski",
    ...     p=2,
    ...     metric_params=None,
    ...     algorithm="auto",
    ...     leaf_size=30,
    ...     n_jobs=None,
    ... )
    >>> ordering
    array([0, 1, 2, 5, 3, 4])
    >>> core_distances
    array([3.16..., 1.41..., 1.41..., 1. , 1. ,
           4.12...])
    >>> reachability
    array([ inf, 3.16..., 1.41..., 4.12..., 1. ,
           5. ])
    >>> predecessor
    array([-1, 0, 1, 5, 3, 2])
    """
    n_samples = X.shape[0]
    _validate_size(min_samples, n_samples, "min_samples")
    if min_samples <= 1:
        # min_samples was given as a fraction of the sample count;
        # convert to an absolute count of at least 2.
        min_samples = max(2, int(min_samples * n_samples))

    # Start all points as 'unprocessed': infinite reachability, no predecessor.
    reachability_ = np.empty(n_samples)
    reachability_.fill(np.inf)
    predecessor_ = np.empty(n_samples, dtype=int)
    predecessor_.fill(-1)

    nbrs = NearestNeighbors(
        n_neighbors=min_samples,
        algorithm=algorithm,
        leaf_size=leaf_size,
        metric=metric,
        metric_params=metric_params,
        p=p,
        n_jobs=n_jobs,
    )

    nbrs.fit(X)
    # First do a kNN query for every point to get the core distances; this
    # differs from the original OPTICS which only used epsilon range queries.
    # TODO: thread working_memory through instead of hard-coding None?
    core_distances_ = _compute_core_distances_(
        X=X, neighbors=nbrs, min_samples=min_samples, working_memory=None
    )
    # OPTICS puts an upper limit on these; use inf for 'never a core point'.
    core_distances_[core_distances_ > max_eps] = np.inf
    # Round to the dtype's decimal precision to stabilize later comparisons.
    np.around(
        core_distances_,
        decimals=np.finfo(core_distances_.dtype).precision,
        out=core_distances_,
    )

    # Main OPTICS loop: O(n^2) because every iteration scans all unprocessed
    # points. Not parallelizable — the order in which entries are written to
    # `ordering` is the algorithm's output and depends on previous iterations.
    processed = np.zeros(X.shape[0], dtype=bool)
    ordering = np.zeros(X.shape[0], dtype=int)
    for ordering_idx in range(X.shape[0]):
        # Choose the next point as the unprocessed one with the smallest
        # reachability (np.argmin prefers the smaller index on ties,
        # including all-inf reachabilities).
        index = np.where(processed == 0)[0]
        point = index[np.argmin(reachability_[index])]

        processed[point] = True
        ordering[ordering_idx] = point
        # Only core points (finite core distance) expand reachability to
        # their neighbors.
        if core_distances_[point] != np.inf:
            _set_reach_dist(
                core_distances_=core_distances_,
                reachability_=reachability_,
                predecessor_=predecessor_,
                point_index=point,
                processed=processed,
                X=X,
                nbrs=nbrs,
                metric=metric,
                metric_params=metric_params,
                p=p,
                max_eps=max_eps,
            )
    if np.all(np.isinf(reachability_)):
        warnings.warn(
            (
                "All reachability values are inf. Set a larger"
                " max_eps or all data will be considered outliers."
            ),
            UserWarning,
        )
    return ordering, core_distances_, reachability_, predecessor_
|
|
|
|
def _set_reach_dist(
    core_distances_,
    reachability_,
    predecessor_,
    point_index,
    processed,
    X,
    nbrs,
    metric,
    metric_params,
    p,
    max_eps,
):
    """Update, in place, reachability and predecessor of the unprocessed
    neighbors of the point just added to the cluster ordering.

    ``reachability_[j]`` is lowered to ``max(dist(point, j), core_distance)``
    wherever that improves on the current value, and ``predecessor_[j]`` is
    set to ``point_index`` for those improved entries.
    """
    # Keep a 2D (1, n_features) slice so metric calls receive a matrix.
    P = X[point_index : point_index + 1]
    # Query neighbor indices only (no distances): assume radius_neighbors is
    # faster without distances, at the cost of possibly recomputing some
    # distances below.
    indices = nbrs.radius_neighbors(P, radius=max_eps, return_distance=False)[0]

    # Keep only neighbors that have not been processed yet.
    unproc = np.compress(~np.take(processed, indices), indices)
    # All neighbors of the current point are already processed: nothing to do.
    if not unproc.size:
        return

    # Only compute distances to unprocessed neighbors.
    if metric == "precomputed":
        dists = X[[point_index], unproc]
        # Sparse fancy indexing may yield an np.matrix; normalize to ndarray.
        if isinstance(dists, np.matrix):
            dists = np.asarray(dists)
        dists = dists.ravel()
    else:
        _params = dict() if metric_params is None else metric_params.copy()
        if metric == "minkowski" and "p" not in _params:
            # Same logic as in neighbors: p is ignored if explicitly set
            # in the metric_params dict.
            _params["p"] = p
        dists = pairwise_distances(P, X[unproc], metric, n_jobs=None, **_params).ravel()

    # Reachability distance: at least the core distance of the current point.
    rdists = np.maximum(dists, core_distances_[point_index])
    # Round like the core distances so comparisons are consistent.
    np.around(rdists, decimals=np.finfo(rdists.dtype).precision, out=rdists)
    improved = np.where(rdists < np.take(reachability_, unproc))
    reachability_[unproc[improved]] = rdists[improved]
    predecessor_[unproc[improved]] = point_index
|
|
|
|
@validate_params(
    {
        "reachability": [np.ndarray],
        "core_distances": [np.ndarray],
        "ordering": [np.ndarray],
        "eps": [Interval(Real, 0, None, closed="both")],
    },
    prefer_skip_nested_validation=True,
)
def cluster_optics_dbscan(*, reachability, core_distances, ordering, eps):
    """Perform DBSCAN extraction for an arbitrary epsilon.

    Extracting the clusters runs in linear time. Note that this results in
    ``labels_`` which are close to a :class:`~sklearn.cluster.DBSCAN` with
    similar settings and ``eps``, only if ``eps`` is close to ``max_eps``.

    Parameters
    ----------
    reachability : ndarray of shape (n_samples,)
        Reachability distances calculated by OPTICS (``reachability_``).

    core_distances : ndarray of shape (n_samples,)
        Distances at which points become core (``core_distances_``).

    ordering : ndarray of shape (n_samples,)
        OPTICS ordered point indices (``ordering_``).

    eps : float
        DBSCAN ``eps`` parameter. Must be set to < ``max_eps``. Results
        will be close to DBSCAN algorithm if ``eps`` and ``max_eps`` are close
        to one another.

    Returns
    -------
    labels_ : array of shape (n_samples,)
        The estimated labels.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.cluster import cluster_optics_dbscan, compute_optics_graph
    >>> X = np.array([[1, 2], [2, 5], [3, 6],
    ...               [8, 7], [8, 8], [7, 3]])
    >>> ordering, core_distances, reachability, predecessor = compute_optics_graph(
    ...     X,
    ...     min_samples=2,
    ...     max_eps=np.inf,
    ...     metric="minkowski",
    ...     p=2,
    ...     metric_params=None,
    ...     algorithm="auto",
    ...     leaf_size=30,
    ...     n_jobs=None,
    ... )
    >>> eps = 4.5
    >>> labels = cluster_optics_dbscan(
    ...     reachability=reachability,
    ...     core_distances=core_distances,
    ...     ordering=ordering,
    ...     eps=eps,
    ... )
    >>> labels
    array([0, 0, 0, 1, 1, 1])
    """
    labels = np.zeros(len(core_distances), dtype=int)

    # A point is unreachable (within eps) where its reachability exceeds eps;
    # it is a core point where its core distance is at most eps.
    unreachable = reachability > eps
    is_core = core_distances <= eps

    # Walking the OPTICS ordering, each unreachable core point opens a new
    # cluster; every other point joins the cluster opened most recently.
    labels[ordering] = np.cumsum(unreachable[ordering] & is_core[ordering]) - 1
    # Points that are neither reachable nor core are noise.
    labels[unreachable & ~is_core] = -1
    return labels
|
|
|
|
@validate_params(
    {
        "reachability": [np.ndarray],
        "predecessor": [np.ndarray],
        "ordering": [np.ndarray],
        "min_samples": [
            Interval(Integral, 2, None, closed="left"),
            Interval(RealNotInt, 0, 1, closed="both"),
        ],
        "min_cluster_size": [
            Interval(Integral, 2, None, closed="left"),
            Interval(RealNotInt, 0, 1, closed="both"),
            None,
        ],
        "xi": [Interval(Real, 0, 1, closed="both")],
        "predecessor_correction": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def cluster_optics_xi(
    *,
    reachability,
    predecessor,
    ordering,
    min_samples,
    min_cluster_size=None,
    xi=0.05,
    predecessor_correction=True,
):
    """Automatically extract clusters according to the Xi-steep method.

    Parameters
    ----------
    reachability : ndarray of shape (n_samples,)
        Reachability distances calculated by OPTICS (`reachability_`).

    predecessor : ndarray of shape (n_samples,)
        Predecessors calculated by OPTICS.

    ordering : ndarray of shape (n_samples,)
        OPTICS ordered point indices (`ordering_`).

    min_samples : int > 1 or float between 0 and 1
        The same as the min_samples given to OPTICS. Up and down steep regions
        can't have more than ``min_samples`` consecutive non-steep points.
        Expressed as an absolute number or a fraction of the number of samples
        (rounded to be at least 2).

    min_cluster_size : int > 1 or float between 0 and 1, default=None
        Minimum number of samples in an OPTICS cluster, expressed as an
        absolute number or a fraction of the number of samples (rounded to be
        at least 2). If ``None``, the value of ``min_samples`` is used instead.

    xi : float between 0 and 1, default=0.05
        Determines the minimum steepness on the reachability plot that
        constitutes a cluster boundary. For example, an upwards point in the
        reachability plot is defined by the ratio from one point to its
        successor being at most 1-xi.

    predecessor_correction : bool, default=True
        Correct clusters based on the calculated predecessors.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        The labels assigned to samples. Points which are not included
        in any cluster are labeled as -1.

    clusters : ndarray of shape (n_clusters, 2)
        The list of clusters in the form of ``[start, end]`` in each row, with
        all indices inclusive. The clusters are ordered according to ``(end,
        -start)`` (ascending) so that larger clusters encompassing smaller
        clusters come after such nested smaller clusters. Since ``labels`` does
        not reflect the hierarchy, usually ``len(clusters) >
        np.unique(labels)``.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.cluster import cluster_optics_xi, compute_optics_graph
    >>> X = np.array([[1, 2], [2, 5], [3, 6],
    ...               [8, 7], [8, 8], [7, 3]])
    >>> ordering, core_distances, reachability, predecessor = compute_optics_graph(
    ...     X,
    ...     min_samples=2,
    ...     max_eps=np.inf,
    ...     metric="minkowski",
    ...     p=2,
    ...     metric_params=None,
    ...     algorithm="auto",
    ...     leaf_size=30,
    ...     n_jobs=None
    ... )
    >>> min_samples = 2
    >>> labels, clusters = cluster_optics_xi(
    ...     reachability=reachability,
    ...     predecessor=predecessor,
    ...     ordering=ordering,
    ...     min_samples=min_samples,
    ... )
    >>> labels
    array([0, 0, 0, 1, 1, 1])
    >>> clusters
    array([[0, 2],
           [3, 5],
           [0, 5]])
    """
    n_samples = len(reachability)
    _validate_size(min_samples, n_samples, "min_samples")
    if min_samples <= 1:
        # A fraction was given: convert to an absolute count of at least 2.
        min_samples = max(2, int(min_samples * n_samples))
    if min_cluster_size is None:
        min_cluster_size = min_samples
    _validate_size(min_cluster_size, n_samples, "min_cluster_size")
    if min_cluster_size <= 1:
        # Same fraction-to-count conversion as for min_samples.
        min_cluster_size = max(2, int(min_cluster_size * n_samples))

    # Work on the reachability "plot", i.e. values taken in OPTICS order.
    reachability_plot = reachability[ordering]
    predecessor_plot = predecessor[ordering]
    clusters = _xi_cluster(
        reachability_plot,
        predecessor_plot,
        ordering,
        xi,
        min_samples,
        min_cluster_size,
        predecessor_correction,
    )
    labels = _extract_xi_labels(ordering, clusters)
    return labels, clusters
|
|
|
|
| def _extend_region(steep_point, xward_point, start, min_samples): |
| """Extend the area until it's maximal. |
| |
| It's the same function for both upward and downward reagions, depending on |
| the given input parameters. Assuming: |
| |
| - steep_{upward/downward}: bool array indicating whether a point is a |
| steep {upward/downward}; |
| - upward/downward: bool array indicating whether a point is |
| upward/downward; |
| |
| To extend an upward reagion, ``steep_point=steep_upward`` and |
| ``xward_point=downward`` are expected, and to extend a downward region, |
| ``steep_point=steep_downward`` and ``xward_point=upward``. |
| |
| Parameters |
| ---------- |
| steep_point : ndarray of shape (n_samples,), dtype=bool |
| True if the point is steep downward (upward). |
| |
| xward_point : ndarray of shape (n_samples,), dtype=bool |
| True if the point is an upward (respectively downward) point. |
| |
| start : int |
| The start of the xward region. |
| |
| min_samples : int |
| The same as the min_samples given to OPTICS. Up and down steep |
| regions can't have more then ``min_samples`` consecutive non-steep |
| points. |
| |
| Returns |
| ------- |
| index : int |
| The current index iterating over all the samples, i.e. where we are up |
| to in our search. |
| |
| end : int |
| The end of the region, which can be behind the index. The region |
| includes the ``end`` index. |
| """ |
| n_samples = len(steep_point) |
| non_xward_points = 0 |
| index = start |
| end = start |
| |
| while index < n_samples: |
| if steep_point[index]: |
| non_xward_points = 0 |
| end = index |
| elif not xward_point[index]: |
| |
| non_xward_points += 1 |
| |
| |
| if non_xward_points > min_samples: |
| break |
| else: |
| return end |
| index += 1 |
| return end |
|
|
|
|
| def _update_filter_sdas(sdas, mib, xi_complement, reachability_plot): |
| """Update steep down areas (SDAs) using the new maximum in between (mib) |
| value, and the given complement of xi, i.e. ``1 - xi``. |
| """ |
| if np.isinf(mib): |
| return [] |
| res = [ |
| sda for sda in sdas if mib <= reachability_plot[sda["start"]] * xi_complement |
| ] |
| for sda in res: |
| sda["mib"] = max(sda["mib"], mib) |
| return res |
|
|
|
|
| def _correct_predecessor(reachability_plot, predecessor_plot, ordering, s, e): |
| """Correct for predecessors. |
| |
| Applies Algorithm 2 of [1]_. |
| |
| Input parameters are ordered by the computer OPTICS ordering. |
| |
| .. [1] Schubert, Erich, Michael Gertz. |
| "Improving the Cluster Structure Extracted from OPTICS Plots." Proc. of |
| the Conference "Lernen, Wissen, Daten, Analysen" (LWDA) (2018): 318-329. |
| """ |
| while s < e: |
| if reachability_plot[s] > reachability_plot[e]: |
| return s, e |
| p_e = predecessor_plot[e] |
| for i in range(s, e): |
| if p_e == ordering[i]: |
| return s, e |
| e -= 1 |
| return None, None |
|
|
|
|
def _xi_cluster(
    reachability_plot,
    predecessor_plot,
    ordering,
    xi,
    min_samples,
    min_cluster_size,
    predecessor_correction,
):
    """Automatically extract clusters according to the Xi-steep method.

    This is roughly an implementation of Figure 19 of the OPTICS paper.

    Parameters
    ----------
    reachability_plot : array-like of shape (n_samples,)
        The reachability plot, i.e. reachability ordered according to
        the calculated ordering, all computed by OPTICS.

    predecessor_plot : array-like of shape (n_samples,)
        Predecessors ordered according to the calculated ordering.

    ordering : array-like of shape (n_samples,)
        The ordering of points calculated by OPTICS.

    xi : float, between 0 and 1
        Determines the minimum steepness on the reachability plot that
        constitutes a cluster boundary. For example, an upwards point in the
        reachability plot is defined by the ratio from one point to its
        successor being at most 1-xi.

    min_samples : int > 1
        The same as the min_samples given to OPTICS. Up and down steep regions
        can't have more than ``min_samples`` consecutive non-steep points.

    min_cluster_size : int > 1
        Minimum number of samples in an OPTICS cluster.

    predecessor_correction : bool
        Correct clusters based on the calculated predecessors.

    Returns
    -------
    clusters : ndarray of shape (n_clusters, 2)
        The list of clusters in the form of [start, end] in each row, with all
        indices inclusive. The clusters are ordered in a way that larger
        clusters encompassing smaller clusters come after those smaller
        clusters.
    """

    # Append an inf to the end of the reachability plot so that a cluster
    # reaching the last sample can still be closed (the final "upward"
    # comparison always succeeds against inf).
    reachability_plot = np.hstack((reachability_plot, np.inf))

    xi_complement = 1 - xi
    sdas = []  # steep down areas still eligible to start a cluster
    clusters = []
    index = 0  # first position not yet consumed by a processed steep region
    mib = 0.0  # maximum reachability seen "in between" steep regions

    # Classify every position by the ratio of consecutive reachabilities.
    # Division may produce NaN (e.g. inf/inf); those NaN comparisons are all
    # False, which is the desired outcome, so the warning is suppressed.
    with np.errstate(invalid="ignore"):
        ratio = reachability_plot[:-1] / reachability_plot[1:]
        steep_upward = ratio <= xi_complement
        steep_downward = ratio >= 1 / xi_complement
        downward = ratio > 1
        upward = ratio < 1

    # Iterate only over positions that start a steep (up or down) region,
    # skipping positions already swallowed by a previously extended region.
    for steep_index in iter(np.flatnonzero(steep_upward | steep_downward)):
        # already part of a previously extended steep region
        if steep_index < index:
            continue

        # update the maximum seen since the end of the last steep region
        mib = max(mib, np.max(reachability_plot[index : steep_index + 1]))

        # steep downward region: record it as a candidate cluster start
        if steep_downward[steep_index]:
            sdas = _update_filter_sdas(sdas, mib, xi_complement, reachability_plot)
            D_start = steep_index
            D_end = _extend_region(steep_downward, upward, D_start, min_samples)
            D = {"start": D_start, "end": D_end, "mib": 0.0}
            sdas.append(D)
            index = D_end + 1
            mib = reachability_plot[index]

        # steep upward region: try to close a cluster against each SDA
        else:
            sdas = _update_filter_sdas(sdas, mib, xi_complement, reachability_plot)
            U_start = steep_index
            U_end = _extend_region(steep_upward, downward, U_start, min_samples)
            index = U_end + 1
            mib = reachability_plot[index]

            U_clusters = []
            for D in sdas:
                c_start = D["start"]
                c_end = U_end

                # reject if the maximum seen between the two regions exceeds
                # the scaled reachability at the cluster's end
                if reachability_plot[c_end + 1] * xi_complement < D["mib"]:
                    continue

                # balance the two cluster boundaries so neither side starts
                # far above the other on the reachability plot
                D_max = reachability_plot[D["start"]]
                if D_max * xi_complement >= reachability_plot[c_end + 1]:
                    # the start is much higher than the end: move the start
                    # forward while its successor stays above the end level
                    while (
                        reachability_plot[c_start + 1] > reachability_plot[c_end + 1]
                        and c_start < D["end"]
                    ):
                        c_start += 1
                elif reachability_plot[c_end + 1] * xi_complement >= D_max:
                    # the end is much higher than the start: pull the end
                    # back while it stays above the start level
                    while reachability_plot[c_end - 1] > D_max and c_end > U_start:
                        c_end -= 1

                # optionally refine the boundaries with Algorithm 2 of the
                # Schubert & Gertz (2018) predecessor-correction paper
                if predecessor_correction:
                    c_start, c_end = _correct_predecessor(
                        reachability_plot, predecessor_plot, ordering, c_start, c_end
                    )
                if c_start is None:
                    continue

                # the cluster must contain at least min_cluster_size samples
                if c_end - c_start + 1 < min_cluster_size:
                    continue

                # the cluster must begin inside the steep down area
                if c_start > D["end"]:
                    continue

                # the cluster must end inside the steep up area
                if c_end < U_start:
                    continue

                U_clusters.append((c_start, c_end))

            # reverse so that smaller (more recently started) clusters come
            # before the larger clusters that encompass them
            U_clusters.reverse()
            clusters.extend(U_clusters)

    return np.array(clusters)
|
|
|
|
| def _extract_xi_labels(ordering, clusters): |
| """Extracts the labels from the clusters returned by `_xi_cluster`. |
| We rely on the fact that clusters are stored |
| with the smaller clusters coming before the larger ones. |
| |
| Parameters |
| ---------- |
| ordering : array-like of shape (n_samples,) |
| The ordering of points calculated by OPTICS |
| |
| clusters : array-like of shape (n_clusters, 2) |
| List of clusters i.e. (start, end) tuples, |
| as returned by `_xi_cluster`. |
| |
| Returns |
| ------- |
| labels : ndarray of shape (n_samples,) |
| """ |
|
|
| labels = np.full(len(ordering), -1, dtype=int) |
| label = 0 |
| for c in clusters: |
| if not np.any(labels[c[0] : (c[1] + 1)] != -1): |
| labels[c[0] : (c[1] + 1)] = label |
| label += 1 |
| labels[ordering] = labels.copy() |
| return labels |
|
|