| |
| |
|
|
| import warnings |
|
|
| import numpy as np |
| import pytest |
|
|
| from sklearn.cluster import DBSCAN, OPTICS |
| from sklearn.cluster._optics import _extend_region, _extract_xi_labels |
| from sklearn.cluster.tests.common import generate_clustered_data |
| from sklearn.datasets import make_blobs |
| from sklearn.exceptions import DataConversionWarning, EfficiencyWarning |
| from sklearn.metrics.cluster import contingency_matrix |
| from sklearn.metrics.pairwise import pairwise_distances |
| from sklearn.utils import shuffle |
| from sklearn.utils._testing import assert_allclose, assert_array_equal |
| from sklearn.utils.fixes import CSR_CONTAINERS |
|
|
# Shared fixture: six 2-D Gaussian blobs of ten points each, stacked into ``X``.
# All draws come from one seeded generator, so the order of the specs below
# fixes the exact coordinates seen by the tests that read the module-level ``X``.
rng = np.random.RandomState(0)
n_points_per_cluster = 10
_BLOB_SPECS = (
    ([-5, -2], 0.8),
    ([4, -1], 0.1),
    ([1, -2], 0.2),
    ([-2, 3], 0.3),
    ([3, -2], 1.6),
    ([5, 6], 2),
)
# The generator expression is consumed left to right, one draw per blob.
C1, C2, C3, C4, C5, C6 = (
    center + spread * rng.randn(n_points_per_cluster, 2)
    for center, spread in _BLOB_SPECS
)
X = np.vstack((C1, C2, C3, C4, C5, C6))
|
|
|
|
@pytest.mark.parametrize(
    ("r_plot", "end"),
    [
        [[10, 8.9, 8.8, 8.7, 7, 10], 3],
        [[10, 8.9, 8.8, 8.7, 8.6, 7, 10], 0],
        # This case used to be listed twice; the identical copy only re-ran
        # the same check, so it is kept once.
        [[10, 8.9, 8.8, 8.7, 7, 6, np.inf], 4],
    ],
)
def test_extend_downward(r_plot, end):
    """Check the end index ``_extend_region`` returns on downward profiles.

    ``r_plot`` is a toy reachability profile.  The boolean masks handed to
    ``_extend_region`` are derived from the ratio of each value to its
    successor, using a 0.9 steepness factor.  The trailing ``0`` and ``2``
    arguments are presumably the start index and ``min_samples`` -- confirm
    against ``_extend_region``.
    """
    r_plot = np.array(r_plot)
    ratio = r_plot[:-1] / r_plot[1:]
    steep_downward = ratio >= 1 / 0.9
    upward = ratio < 1

    e = _extend_region(steep_downward, upward, 0, 2)
    assert e == end
|
|
|
|
@pytest.mark.parametrize(
    ("r_plot", "end"),
    [
        ([1, 2, 2.1, 2.2, 4, 8, 8, np.inf], 6),
        ([1, 2, 2.1, 2.2, 2.3, 4, 8, 8, np.inf], 0),
        ([1, 2, 2.1, 2, np.inf], 0),
        ([1, 2, 2.1, np.inf], 2),
    ],
)
def test_extend_upward(r_plot, end):
    """Check the end index ``_extend_region`` returns on upward profiles.

    Mirror image of the downward test: the masks mark steep-upward and
    downward steps of the toy reachability profile ``r_plot``.
    """
    profile = np.array(r_plot)
    # Ratio of every value to the one that follows it.
    step_ratio = profile[:-1] / profile[1:]
    is_steep_upward = step_ratio <= 0.9
    is_downward = step_ratio > 1

    assert _extend_region(is_steep_upward, is_downward, 0, 2) == end
|
|
|
|
@pytest.mark.parametrize(
    ("ordering", "clusters", "expected"),
    [
        ([0, 1, 2, 3], [[0, 1], [2, 3]], [0, 0, 1, 1]),
        ([0, 1, 2, 3], [[0, 1], [3, 3]], [0, 0, -1, 1]),
        ([0, 1, 2, 3], [[0, 1], [3, 3], [0, 3]], [0, 0, -1, 1]),
        ([3, 1, 2, 0], [[0, 1], [3, 3], [0, 3]], [1, 0, -1, 0]),
    ],
)
def test_the_extract_xi_labels(ordering, clusters, expected):
    """Check the labels ``_extract_xi_labels`` derives from cluster intervals."""
    assert_array_equal(_extract_xi_labels(ordering, clusters), expected)
|
|
|
|
def test_extract_xi(global_dtype):
    """Check ``cluster_method="xi"`` labels against hard-coded expectations.

    Three hand-built datasets are fitted; the far-away ``[100, 100]`` points
    of the first two are expected to come out as noise (``-1``).  One seeded
    generator is shared by data creation and shuffling, so the order of the
    statements below must not change: the expected labels are tied to these
    exact draws.
    """
    rng = np.random.RandomState(0)
    blob_size = 5
    blob_specs = (
        ([-5, -2], 0.8),
        ([4, -1], 0.1),
        ([1, -2], 0.2),
        ([-2, 3], 0.3),
        ([3, -2], 0.6),
        ([5, 6], 0.2),
    )
    blobs = [
        center + spread * rng.randn(blob_size, 2) for center, spread in blob_specs
    ]

    # Dataset 1: a single far-away point sits between the fifth and sixth blob.
    far_point = np.array([[100, 100]])
    data = np.vstack((*blobs[:5], far_point, blobs[5])).astype(
        global_dtype, copy=False
    )
    truth = np.repeat([2, 0, 1, 3, 1, -1, 4], [5, 5, 5, 5, 5, 1, 5])
    data, truth = shuffle(data, truth, random_state=rng)

    model = OPTICS(
        min_samples=3, min_cluster_size=2, max_eps=20, cluster_method="xi", xi=0.4
    )
    assert_array_equal(model.fit(data).labels_, truth)

    # Same data, with min_samples / min_cluster_size given as fractions.
    model = OPTICS(
        min_samples=0.1, min_cluster_size=0.08, max_eps=20, cluster_method="xi", xi=0.4
    )
    assert_array_equal(model.fit(data).labels_, truth)

    # Dataset 2: two coincident far-away points and a larger min_cluster_size.
    far_points = np.array([[100, 100]] * 2)
    data = np.vstack((*blobs[:5], far_points, blobs[5])).astype(
        global_dtype, copy=False
    )
    truth = np.repeat([1, 3, 2, 0, 2, -1, 4], [5, 5, 5, 5, 5, 2, 5])
    data, truth = shuffle(data, truth, random_state=rng)

    model = OPTICS(
        min_samples=3, min_cluster_size=3, max_eps=20, cluster_method="xi", xi=0.3
    )
    assert_array_equal(model.fit(data).labels_, truth)

    # Dataset 3: three four-point groups at very different scales.
    groups = (
        [[0, 0], [0, 0.1], [0, -0.1], [0.1, 0]],
        [[10, 10], [10, 9], [10, 11], [9, 10]],
        [[100, 100], [100, 90], [100, 110], [90, 100]],
    )
    data = np.vstack(groups).astype(global_dtype, copy=False)
    truth = np.repeat([0, 1, 2], 4)
    data, truth = shuffle(data, truth, random_state=rng)

    model = OPTICS(
        min_samples=2, min_cluster_size=2, max_eps=np.inf, cluster_method="xi", xi=0.04
    )
    assert_array_equal(model.fit(data).labels_, truth)
|
|
|
|
def test_cluster_hierarchy_(global_dtype):
    """Check ``cluster_hierarchy_`` on a tight blob nested inside a wide one.

    Both blobs have 100 points and are centred on the origin (scales 2 and
    50).  The hierarchy is expected to have two rows that stay close to
    ``[0, 99]`` and ``[0, 199]``.
    """
    rng = np.random.RandomState(0)
    blob_size = 100
    # NOTE(review): adding the int list ``[0, 0]`` to a float32 array promotes
    # the result to float64, so ``global_dtype=float32`` may not actually reach
    # the estimator -- confirm whether that is intended.
    tight = [0, 0] + 2 * rng.randn(blob_size, 2).astype(global_dtype, copy=False)
    wide = [0, 0] + 50 * rng.randn(blob_size, 2).astype(global_dtype, copy=False)
    data = shuffle(np.vstack((tight, wide)), random_state=0)

    hierarchy = OPTICS(min_samples=20, xi=0.1).fit(data).cluster_hierarchy_
    assert hierarchy.shape == (2, 2)

    reference = np.array([[0, 99], [0, 199]])
    # NOTE(review): this is a signed sum, so positive and negative offsets can
    # cancel out (and a negative total always passes).
    signed_offset = (hierarchy - reference).sum()
    assert signed_offset / data.shape[0] < 0.05
|
|
|
|
@pytest.mark.parametrize(
    "csr_container, metric",
    [(None, "minkowski"), *((container, "euclidean") for container in CSR_CONTAINERS)],
)
def test_correct_number_of_clusters(metric, csr_container):
    """Check the cluster count and the shape/dtype of the fitted attributes.

    The data comes from ``generate_clustered_data`` with three clusters and is
    fitted either dense or wrapped in ``csr_container``.
    """
    expected_n_clusters = 3
    data = generate_clustered_data(n_clusters=expected_n_clusters)
    fit_input = data if csr_container is None else csr_container(data)

    model = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=0.1, metric=metric)
    model.fit(fit_input)

    # Count distinct labels, leaving out the noise label -1 when present.
    found_labels = set(model.labels_)
    found_labels.discard(-1)
    assert len(found_labels) == expected_n_clusters

    # Every per-sample attribute is 1-D with one entry per sample.
    n_samples = len(data)
    expected_kinds = (
        ("labels_", "i"),
        ("reachability_", "f"),
        ("core_distances_", "f"),
        ("ordering_", "i"),
    )
    for attribute, dtype_kind in expected_kinds:
        values = getattr(model, attribute)
        assert values.shape == (n_samples,)
        assert values.dtype.kind == dtype_kind

    # The ordering must be a permutation of all sample indices.
    assert set(model.ordering_) == set(range(n_samples))
|
|
|
|
def test_minimum_number_of_sample_check():
    """Check that fitting one sample with ``min_samples=10`` raises ValueError."""
    single_sample = [[1, 1]]
    estimator = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1.0)

    with pytest.raises(ValueError, match="min_samples must be no greater than"):
        estimator.fit(single_sample)
|
|
|
|
def test_bad_extract():
    """Check the error raised when the dbscan ``eps`` exceeds ``max_eps``.

    ``max_eps`` is 0.15 while ``eps`` is 0.3, which the expected message
    reports back verbatim.
    """
    blob_centers = [[1, 1], [-1, -1], [1, -1]]
    data, _ = make_blobs(
        n_samples=750, centers=blob_centers, cluster_std=0.4, random_state=0
    )
    estimator = OPTICS(
        max_eps=5.0 * 0.03, cluster_method="dbscan", eps=0.3, min_samples=10
    )

    expected_message = "Specify an epsilon smaller than 0.15. Got 0.3."
    with pytest.raises(ValueError, match=expected_message):
        estimator.fit(data)
|
|
|
|
def test_bad_reachability():
    """Check the warning emitted when every reachability value is infinite.

    ``max_eps`` is set to a very small value (0.015) relative to blobs with a
    standard deviation of 0.4.
    """
    blob_centers = [[1, 1], [-1, -1], [1, -1]]
    data, _ = make_blobs(
        n_samples=750, centers=blob_centers, cluster_std=0.4, random_state=0
    )

    expected_message = "All reachability values are inf. Set a larger max_eps."
    with pytest.warns(UserWarning, match=expected_message):
        OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015).fit(data)
|
|
|
|
def test_nowarn_if_metric_bool_data_bool():
    """Check that boolean data with a boolean metric raises no conversion warning.

    ``DataConversionWarning`` is turned into an error for the duration of the
    fit, so the test fails if OPTICS emits it for already-boolean input.
    """
    pairwise_metric = "rogerstanimoto"
    # Seeded generator rather than the global numpy RNG, so the input is the
    # same on every run and a failure can be reproduced.
    X = np.random.RandomState(0).randint(2, size=(5, 2), dtype=bool)

    with warnings.catch_warnings():
        warnings.simplefilter("error", DataConversionWarning)

        OPTICS(metric=pairwise_metric).fit(X)
|
|
|
|
def test_warn_if_metric_bool_data_no_bool():
    """Check that integer data with a boolean metric warns exactly once.

    The fit must emit a ``DataConversionWarning`` whose message names the
    metric, and no other warning may be recorded alongside it.
    """
    pairwise_metric = "rogerstanimoto"
    # Seeded generator rather than the global numpy RNG, so the input is the
    # same on every run and a failure can be reproduced.
    X = np.random.RandomState(0).randint(2, size=(5, 2), dtype=np.int32)
    msg = f"Data will be converted to boolean for metric {pairwise_metric}"

    with pytest.warns(DataConversionWarning, match=msg) as warn_record:
        OPTICS(metric=pairwise_metric).fit(X)
        assert len(warn_record) == 1
|
|
|
|
def test_nowarn_if_metric_no_bool():
    """Check that a non-boolean metric never raises a conversion warning.

    Both boolean and integer inputs are fitted with ``DataConversionWarning``
    turned into an error.
    """
    pairwise_metric = "minkowski"
    # Seeded generator rather than the global numpy RNG, so both inputs are
    # the same on every run and a failure can be reproduced.
    rng = np.random.RandomState(0)
    X_bool = rng.randint(2, size=(5, 2), dtype=bool)
    X_num = rng.randint(2, size=(5, 2), dtype=np.int32)

    with warnings.catch_warnings():
        warnings.simplefilter("error", DataConversionWarning)

        # Boolean input.
        OPTICS(metric=pairwise_metric).fit(X_bool)
        # Integer input.
        OPTICS(metric=pairwise_metric).fit(X_num)
|
|
|
|
def test_close_extract():
    """Check dbscan-style extraction when ``eps`` is well below ``max_eps``.

    With three blobs, the largest label is expected to be 2.
    """
    blob_centers = [[1, 1], [-1, -1], [1, -1]]
    data, _ = make_blobs(
        n_samples=750, centers=blob_centers, cluster_std=0.4, random_state=0
    )

    model = OPTICS(max_eps=1.0, cluster_method="dbscan", eps=0.3, min_samples=10)
    model.fit(data)

    assert model.labels_.max() == 2
|
|
|
|
@pytest.mark.parametrize("eps", [0.1, 0.3, 0.5])
@pytest.mark.parametrize("min_samples", [3, 10, 20])
@pytest.mark.parametrize(
    "csr_container, metric",
    [
        (None, "minkowski"),
        (None, "euclidean"),
        *((container, "euclidean") for container in CSR_CONTAINERS),
    ],
)
def test_dbscan_optics_parity(eps, min_samples, metric, global_dtype, csr_container):
    """Check that OPTICS dbscan extraction agrees with DBSCAN on the same data.

    Agreement is measured through the contingency matrix of the two
    labelings; at most 5% of the samples may disagree after rounding.
    """
    blob_centers = [[1, 1], [-1, -1], [1, -1]]
    data, _ = make_blobs(
        n_samples=150, centers=blob_centers, cluster_std=0.4, random_state=0
    )
    if csr_container is not None:
        data = csr_container(data)
    data = data.astype(global_dtype, copy=False)
    n_samples = data.shape[0]

    optics = OPTICS(
        min_samples=min_samples, cluster_method="dbscan", eps=eps, metric=metric
    )
    optics.fit(data)
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    dbscan.fit(data)

    # Best-match counts taken along each axis; keep the more pessimistic one.
    table = contingency_matrix(dbscan.labels_, optics.labels_)
    best_per_column = table.max(axis=0).sum()
    best_per_row = table.max(axis=1).sum()
    n_agree = min(best_per_column, best_per_row)
    n_disagree = n_samples - n_agree

    # One disagreeing sample is forgiven before computing the fraction.
    mismatch_fraction = np.round((n_disagree - 1) / n_samples, 2)

    assert mismatch_fraction <= 0.05
|
|
|
|
def test_min_samples_edge_case(global_dtype):
    """Check xi labels when ``min_samples`` equals or exceeds the group size.

    Three groups of three points are fitted with decreasing ``max_eps`` and
    then with ``min_samples=4``, where every point is expected to be noise
    and a warning about infinite reachability is expected.
    """
    groups = (
        [[0, 0], [0, 0.1], [0, -0.1]],
        [[10, 10], [10, 9], [10, 11]],
        [[100, 100], [100, 96], [100, 106]],
    )
    data = np.vstack(groups).astype(global_dtype, copy=False)

    scenarios = (
        (7, [0, 0, 0, 1, 1, 1, 2, 2, 2]),
        (3, [0, 0, 0, 1, 1, 1, -1, -1, -1]),
    )
    for max_eps, expected in scenarios:
        model = OPTICS(min_samples=3, max_eps=max_eps, cluster_method="xi", xi=0.04)
        assert_array_equal(model.fit(data).labels_, expected)

    with pytest.warns(UserWarning, match="All reachability values"):
        model = OPTICS(min_samples=4, max_eps=3, cluster_method="xi", xi=0.04)
        model.fit(data)
        assert_array_equal(model.labels_, [-1] * 9)
|
|
|
|
| |
@pytest.mark.parametrize("min_cluster_size", range(2, X.shape[0] // 10, 23))
def test_min_cluster_size(min_cluster_size, global_dtype):
    """Check that no extracted cluster is smaller than ``min_cluster_size``.

    Also checks that passing ``min_cluster_size`` as a fraction of the sample
    count gives the same labels as the absolute value.  Uses every other row
    of the module-level ``X``.  With the 60-row ``X`` the parametrize range
    contains the single value 2.
    """
    subsample = X[::2].astype(global_dtype, copy=False)

    absolute = OPTICS(min_samples=9, min_cluster_size=min_cluster_size)
    absolute.fit(subsample)
    not_noise = absolute.labels_ != -1
    sizes = np.bincount(absolute.labels_[not_noise])
    if sizes.size > 0:
        assert sizes.min() >= min_cluster_size

    fractional = OPTICS(
        min_samples=9,
        min_cluster_size=min_cluster_size / subsample.shape[0],
    )
    fractional.fit(subsample)
    assert_array_equal(absolute.labels_, fractional.labels_)
|
|
|
|
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_min_cluster_size_invalid2(csr_container):
    """Check that ``min_cluster_size`` above the sample count is rejected.

    The check is done for the dense module-level ``X`` and for the same data
    wrapped in ``csr_container``.
    """
    too_large = len(X) + 1
    expected_message = "must be no greater than the "

    dense_estimator = OPTICS(min_cluster_size=too_large)
    with pytest.raises(ValueError, match=expected_message):
        dense_estimator.fit(X)

    sparse_estimator = OPTICS(min_cluster_size=too_large, metric="euclidean")
    with pytest.raises(ValueError, match=expected_message):
        sparse_estimator.fit(csr_container(X))
|
|
|
|
def test_processing_order():
    """Pin the fitted attributes of OPTICS for four points on a line.

    The expected reachability, core distances and ordering are hard-coded for
    ``min_samples=3`` and ``max_eps=15``.
    """
    points = [[0], [10], [-10], [25]]
    model = OPTICS(min_samples=3, max_eps=15)
    model.fit(points)

    expectations = (
        ("reachability_", [np.inf, 10, 10, 15]),
        ("core_distances_", [10, 15, np.inf, np.inf]),
        ("ordering_", [0, 1, 2, 3]),
    )
    for attribute, expected in expectations:
        assert_array_equal(getattr(model, attribute), expected)
|
|
|
|
def test_compare_to_ELKI():
    """Compare OPTICS on the module-level ``X`` with stored reference output.

    The reference reachability (``r*``), ordering (``o*``) and predecessor
    (``p*``) arrays are hard-coded; judging by the test name they presumably
    come from ELKI, but the command that produced them is not shown here.
    Two fits are checked: the default ``max_eps`` and ``max_eps=0.5``.
    """
    # fmt: off
    # Reference for OPTICS(min_samples=5); reachability is listed in the
    # order given by ``ordering_``.
    r1 = [
        np.inf, 1.0574896366427478, 0.7587934993548423,
        0.7290174038973836, 0.7290174038973836, 0.7290174038973836,
        0.6861627576116127, 0.7587934993548423, 0.9280118450166668,
        1.1748022534146194, 3.3355455741292257, 0.49618389254482587,
        0.2552805046961355, 0.2552805046961355, 0.24944622248445714,
        0.24944622248445714, 0.24944622248445714, 0.2552805046961355,
        0.2552805046961355, 0.3086779122185853, 4.163024452756142,
        1.623152630340929, 0.45315840475822655, 0.25468325192031926,
        0.2254004358159971, 0.18765711877083036, 0.1821471333893275,
        0.1821471333893275, 0.18765711877083036, 0.18765711877083036,
        0.2240202988740153, 1.154337614548715, 1.342604473837069,
        1.323308536402633, 0.8607514948648837, 0.27219111215810565,
        0.13260875220533205, 0.13260875220533205, 0.09890587675958984,
        0.09890587675958984, 0.13548790801634494, 0.1575483940837384,
        0.17515137170530226, 0.17575920159442388, 0.27219111215810565,
        0.6101447895405373, 1.3189208094864302, 1.323308536402633,
        2.2509184159764577, 2.4517810628594527, 3.675977064404973,
        3.8264795626020365, 2.9130735341510614, 2.9130735341510614,
        2.9130735341510614, 2.9130735341510614, 2.8459300127258036,
        2.8459300127258036, 2.8459300127258036, 3.0321982337972537,
    ]
    o1 = [
        0, 3, 6, 4, 7, 8, 2, 9, 5, 1,
        31, 30, 32, 34, 33, 38, 39, 35, 37, 36,
        44, 21, 23, 24, 22, 25, 27, 29, 26, 28,
        20, 40, 45, 46, 10, 15, 11, 13, 17, 19,
        18, 12, 16, 14, 47, 49, 43, 48, 42, 41,
        53, 57, 51, 52, 56, 59, 54, 55, 58, 50,
    ]
    p1 = [
        -1, 0, 3, 6, 6, 6, 8, 3, 7, 5,
        1, 31, 30, 30, 34, 34, 34, 32, 32, 37,
        36, 44, 21, 23, 24, 22, 25, 25, 22, 22,
        22, 21, 40, 45, 46, 10, 15, 15, 13, 13,
        15, 11, 19, 15, 10, 47, 12, 45, 14, 43,
        42, 53, 57, 57, 57, 57, 59, 59, 59, 58,
    ]
    # fmt: on

    clust1 = OPTICS(min_samples=5).fit(X)
    order1 = clust1.ordering_

    assert_array_equal(order1, np.array(o1))
    assert_array_equal(clust1.predecessor_[order1], np.array(p1))
    assert_allclose(clust1.reachability_[order1], np.array(r1))

    # Every point after the first must have a reachability that is at least
    # the core distance of its predecessor.
    for sample in order1[1:]:
        parent = clust1.predecessor_[sample]
        assert clust1.reachability_[sample] >= clust1.core_distances_[parent]

    # fmt: off
    # Reference for OPTICS(min_samples=5, max_eps=0.5): 11 leading and 18
    # trailing entries are unreachable (inf reachability, predecessor -1).
    r2 = (
        [np.inf] * 11
        + [
            0.27219111215810565, 0.13260875220533205, 0.13260875220533205,
            0.09890587675958984, 0.09890587675958984, 0.13548790801634494,
            0.1575483940837384, 0.17515137170530226, 0.17575920159442388,
            0.27219111215810565, 0.4928068613197889,
        ]
        + [np.inf]
        + [
            0.2666183922512113, 0.18765711877083036, 0.1821471333893275,
            0.1821471333893275, 0.1821471333893275, 0.18715928772277457,
            0.18765711877083036, 0.18765711877083036, 0.25468325192031926,
        ]
        + [np.inf]
        + [
            0.2552805046961355, 0.2552805046961355, 0.24944622248445714,
            0.24944622248445714, 0.24944622248445714, 0.2552805046961355,
            0.2552805046961355, 0.3086779122185853, 0.34466409325984865,
        ]
        + [np.inf] * 18
    )
    o2 = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
        10, 15, 11, 13, 17, 19, 18, 12, 16, 14,
        47, 46, 20, 22, 25, 23, 27, 29, 24, 26,
        28, 21, 30, 32, 34, 33, 38, 39, 35, 37,
        36, 31, 40, 41, 42, 43, 44, 45, 48, 49,
        50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
    ]
    p2 = (
        [-1] * 11
        + [10, 15, 15, 13, 13, 15, 11, 19, 15, 10, 47]
        + [-1]
        + [20, 22, 25, 25, 25, 25, 22, 22, 23]
        + [-1]
        + [30, 30, 34, 34, 34, 32, 32, 37, 38]
        + [-1] * 18
    )
    # fmt: on

    clust2 = OPTICS(min_samples=5, max_eps=0.5).fit(X)
    order2 = clust2.ordering_

    assert_array_equal(order2, np.array(o2))
    assert_array_equal(clust2.predecessor_[order2], np.array(p2))
    assert_allclose(clust2.reachability_[order2], np.array(r2))

    # Core distances no larger than 0.5 must agree between the two fits.
    small_core = np.where(clust1.core_distances_ <= 0.5)[0]
    assert_allclose(
        clust1.core_distances_[small_core], clust2.core_distances_[small_core]
    )
|
|
|
|
def test_extract_dbscan(global_dtype):
    """Check that dbscan-style extraction finds four well-separated blobs.

    The fitted labels must be exactly ``{0, 1, 2, 3}``, i.e. four clusters
    and no noise label.
    """
    rng = np.random.RandomState(0)
    blob_size = 20
    blob_centers = ([-5, -2], [4, -1], [1, 2], [-2, 3])
    # One draw per blob, in the order of the centers above.
    blobs = [center + 0.2 * rng.randn(blob_size, 2) for center in blob_centers]
    data = np.vstack(blobs).astype(global_dtype, copy=False)

    model = OPTICS(cluster_method="dbscan", eps=0.5)
    model.fit(data)

    # np.unique already returns its values sorted.
    assert_array_equal(np.unique(model.labels_), [0, 1, 2, 3])
|
|
|
|
@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS])
def test_precomputed_dists(global_dtype, csr_container):
    """Check that precomputed distances give the same result as raw features.

    Every other row of the module-level ``X`` is fitted twice: once from its
    euclidean distance matrix (dense or wrapped in ``csr_container``) and once
    from the points themselves.
    """
    subsample = X[::2].astype(global_dtype, copy=False)
    distances = pairwise_distances(subsample, metric="euclidean")
    if csr_container is not None:
        distances = csr_container(distances)

    from_distances = OPTICS(min_samples=10, algorithm="brute", metric="precomputed")
    with warnings.catch_warnings():
        # EfficiencyWarning is silenced only for the precomputed fit.
        warnings.simplefilter("ignore", EfficiencyWarning)
        from_distances.fit(distances)

    from_points = OPTICS(min_samples=10, algorithm="brute", metric="euclidean")
    from_points.fit(subsample)

    assert_allclose(from_distances.reachability_, from_points.reachability_)
    assert_array_equal(from_distances.labels_, from_points.labels_)
|
|
|
|
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_optics_input_not_modified_precomputed_sparse_nodiag(csr_container):
    """Check that a precomputed sparse matrix is not modified in place.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27508
    """
    dense = np.random.RandomState(0).rand(6, 6)
    # Zero the diagonal before conversion and verify that the sparse matrix
    # reports no nonzero entry on it.
    np.fill_diagonal(dense, 0)
    sparse = csr_container(dense)
    assert not any(row == col for row, col in zip(*sparse.nonzero()))

    snapshot = sparse.copy()
    OPTICS(metric="precomputed").fit(sparse)

    # Neither the number of stored entries nor the values may have changed.
    assert sparse.nnz == snapshot.nnz
    assert_array_equal(sparse.toarray(), snapshot.toarray())
|
|
|
|
def test_optics_predecessor_correction_ordering():
    """Check that the predecessor-based cluster correction works as expected.

    In this example the predecessor correction used to misbehave because it
    did not use the right indices.  The test checks that reordering the data
    does not change the resulting labels.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/26324
    """
    original = np.array([1, 2, 3, 1, 8, 8, 7, 100]).reshape(-1, 1)
    permutation = [0, 1, 2, 4, 5, 6, 7, 3]
    permuted = original[permutation]

    labels_original = OPTICS(min_samples=3, metric="euclidean").fit(original).labels_
    labels_permuted = OPTICS(min_samples=3, metric="euclidean").fit(permuted).labels_

    assert_array_equal(labels_original[permutation], labels_permuted)
|
|