| """ |
| This function is adapted from [pyod] by [yzhao062] |
| Original source: [https://github.com/yzhao062/pyod] |
| """ |
|
|
| from __future__ import division |
| from __future__ import print_function |
| import numpy as np |
| import math |
| from sklearn.neighbors import LocalOutlierFactor |
| from sklearn.utils.validation import check_array |
| from sklearn.utils.validation import check_is_fitted |
|
|
| from .base import BaseDetector |
| from .feature import Window |
| from ..utils.utility import invert_order |
| from ..utils.utility import zscore |
|
|
| |
class LOF(BaseDetector):
    """Wrapper of scikit-learn LOF Class with more functionalities.
    Unsupervised Outlier Detection using Local Outlier Factor (LOF).

    The anomaly score of each sample is called Local Outlier Factor.
    It measures the local deviation of density of a given sample with
    respect to its neighbors.
    It is local in that the anomaly score depends on how isolated the object
    is with respect to the surrounding neighborhood.
    More precisely, locality is given by k-nearest neighbors, whose distance
    is used to estimate the local density.
    By comparing the local density of a sample to the local densities of
    its neighbors, one can identify samples that have a substantially lower
    density than their neighbors. These are considered outliers.
    See :cite:`breunig2000lof` for details.

    Before fitting and scoring, the input is passed through
    ``Window(window=slidingWindow).convert(X)``; the detector therefore
    operates on the windowed matrix rather than on the raw samples.

    Parameters
    ----------
    slidingWindow : int, optional (default=100)
        Window length handed to ``Window(window=slidingWindow).convert(X)``
        in :meth:`fit` and :meth:`decision_function`. When the windowed
        matrix has fewer rows than the input, the resulting scores are
        edge-padded with ``slidingWindow - 1`` values in total.

    sub : bool, optional (default=True)
        Stored on the instance. It is not read by any method defined in
        this class.

    n_neighbors : int, optional (default=20)
        Number of neighbors to use by default for `kneighbors` queries.
        If n_neighbors is larger than the number of samples provided,
        all samples will be used.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use BallTree
        - 'kd_tree' will use KDTree
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int, optional (default=30)
        Leaf size passed to `BallTree` or `KDTree`. This can
        affect the speed of the construction and query, as well as the memory
        required to store the tree. The optimal value depends on the
        nature of the problem.

    metric : string or callable, default 'minkowski'
        metric used for the distance computation. Any metric from scikit-learn
        or scipy.spatial.distance can be used.

        If 'precomputed', the training input X is expected to be a distance
        matrix.

        If metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays as input and return one value indicating the
        distance between them. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string.

        Valid values for metric are:

        - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
          'manhattan']

        - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
          'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
          'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
          'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
          'sqeuclidean', 'yule']

        See the documentation for scipy.spatial.distance for details on these
        metrics:
        http://docs.scipy.org/doc/scipy/reference/spatial.distance.html

    p : integer, optional (default = 2)
        Parameter for the Minkowski metric from
        sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
        See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances

    metric_params : dict, optional (default = None)
        Additional keyword arguments for the metric function.

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. When fitting this is used to define the
        threshold on the decision function.

    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run for neighbors search.
        If ``-1``, then the number of jobs is set to the number of CPU cores.
        Affects only kneighbors and kneighbors_graph methods.

    novelty : bool (default=True)
        Forwarded to scikit-learn's LocalOutlierFactor. Note that this
        wrapper defaults to ``True``, whereas scikit-learn itself defaults
        to ``False`` (outlier detection only). With ``novelty=True`` be
        aware that you should only use predict, decision_function and
        score_samples on new unseen data and not on the training set.

    normalize : bool, optional (default=True)
        If True, :meth:`fit` z-scores the windowed matrix before fitting
        the detector: along axis 0 with ``ddof=0`` when the input has a
        single feature, along axis 1 with ``ddof=1`` otherwise.
        :meth:`decision_function` does not apply this step.

    Attributes
    ----------
    n_neighbors_ : int
        The actual number of neighbors used for `kneighbors` queries.

    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self, slidingWindow=100, sub=True, n_neighbors=20,
                 algorithm='auto', leaf_size=30,
                 metric='minkowski', p=2, metric_params=None,
                 contamination=0.1, n_jobs=1, novelty=True, normalize=True):
        # ``contamination`` is handled (and stored) by the base detector.
        super(LOF, self).__init__(contamination=contamination)

        # Windowing / preprocessing options specific to this wrapper.
        self.slidingWindow = slidingWindow
        self.sub = sub
        self.normalize = normalize

        # Options forwarded unchanged to sklearn's LocalOutlierFactor.
        self.n_neighbors = n_neighbors
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.metric = metric
        self.p = p
        self.metric_params = metric_params
        self.n_jobs = n_jobs
        self.novelty = novelty

    def _pad_scores(self, scores, n_samples):
        """Edge-pad per-window scores when windowing shortened the series.

        Parameters
        ----------
        scores : numpy array of shape (n_windows,)
            One score per row of the windowed matrix.

        n_samples : int
            Number of rows of the input before windowing.

        Returns
        -------
        scores : numpy array
            ``scores`` itself when it already has at least ``n_samples``
            entries; otherwise a new array with ``slidingWindow - 1`` extra
            entries in total: the first score repeated in front and the
            last score repeated at the back.
        """
        if scores.shape[0] >= n_samples:
            return scores

        missing = self.slidingWindow - 1
        # When ``missing`` is odd the extra element goes to the front.
        front = math.ceil(missing / 2)
        back = missing // 2
        return np.pad(scores, (front, back), mode='edge')

    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        n_samples, n_features = X.shape

        # Convert the series into the windowed matrix the detector is
        # trained on.
        X = Window(window=self.slidingWindow).convert(X)
        if self.normalize:
            if n_features == 1:
                X = zscore(X, axis=0, ddof=0)
            else:
                X = zscore(X, axis=1, ddof=1)

        X = check_array(X)
        self._set_n_classes(y)

        self.detector_ = LocalOutlierFactor(n_neighbors=self.n_neighbors,
                                            algorithm=self.algorithm,
                                            leaf_size=self.leaf_size,
                                            metric=self.metric,
                                            p=self.p,
                                            metric_params=self.metric_params,
                                            contamination=self.contamination,
                                            n_jobs=self.n_jobs,
                                            novelty=self.novelty)
        self.detector_.fit(X=X, y=y)

        # sklearn stores the *negative* LOF; invert it so that larger
        # values mean "more abnormal".
        window_scores = invert_order(self.detector_.negative_outlier_factor_)
        self.decision_scores_ = self._pad_scores(window_scores, n_samples)

        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])

        n_samples, n_features = X.shape

        # NOTE(review): fit() z-scores the windowed matrix when
        # ``self.normalize`` is True, but no normalization is applied here.
        # Confirm whether scoring un-normalized windows is intended.
        X = Window(window=self.slidingWindow).convert(X)

        # The scoring entry point has had different names across
        # scikit-learn versions. Resolve the method first and call it
        # outside of any try block, so that an AttributeError raised while
        # scoring is not mistaken for a missing method.
        scorer = None
        for method_name in ('_score_samples', '_decision_function'):
            scorer = getattr(self.detector_, method_name, None)
            if scorer is not None:
                break
        if scorer is None:
            # Let sklearn raise its own AttributeError if this is
            # unavailable as well.
            scorer = self.detector_.score_samples

        window_scores = invert_order(scorer(X))
        return self._pad_scores(window_scores, n_samples)

    @property
    def n_neighbors_(self):
        """The actual number of neighbors used for kneighbors queries.
        Decorator for scikit-learn LOF attributes.
        """
        return self.detector_.n_neighbors_