| | from audioop import mul |
| | import numpy as np |
| | from pynndescent import NNDescent |
| | from sklearn.linear_model import LinearRegression |
| | from tqdm import tqdm |
| |
|
| |
|
| | class IntrinsicDim: |
| | def __init__(self, data, metric="euclidean"): |
| | |
| | self.data = np.unique(data.reshape(len(data), -1), axis=0) |
| |
|
| | self.metric = metric |
| | self.name = "Intrinsic Dimension" |
| |
|
| |
|
| | def find_mu(self): |
| | |
| | n_trees = min(64, 5 + int(round((self.data.shape[0]) ** 0.5 / 20.0))) |
| | |
| | n_iters = max(5, int(round(np.log2(self.data.shape[0])))) |
| | |
| | |
| | nnd = NNDescent( |
| | self.data, |
| | n_neighbors=3, |
| | metric=self.metric, |
| | n_trees=n_trees, |
| | n_iters=n_iters, |
| | max_candidates=10, |
| | verbose=False |
| | ) |
| | _, knn_dists = nnd.neighbor_graph |
| | mu = knn_dists[:, 2] / knn_dists[:, 1] |
| | return mu |
| |
|
| | def estimate_id_fast(self): |
| | mu = self.find_mu() |
| | N = self.data.shape[0] |
| | sort_idx = np.argsort(mu) |
| | Femp = np.arange(N)/N |
| | lr = LinearRegression(fit_intercept=False) |
| | lr.fit(np.log(mu[sort_idx]).reshape(-1,1), -np.log(1-Femp).reshape(-1,1)) |
| | d = lr.coef_[0][0] |
| | return d |
| | |
| | def estimate_id(self): |
| | N = self.data.shape[0] |
| | mu = np.zeros(N) |
| | for i in tqdm(range(N)): |
| | dist = np.sort(np.sqrt(np.sum((self.data[i]-self.data)**2, axis=1))) |
| | r1, r2 = dist[dist>0][:2] |
| | mu[i]=r2/r1 |
| | sort_idx = np.argsort(mu) |
| | Femp = np.arange(N)/N |
| | lr = LinearRegression(fit_intercept=False) |
| | lr.fit(np.log(mu[sort_idx]).reshape(-1,1), -np.log(1-Femp).reshape(-1,1)) |
| | d = lr.coef_[0][0] |
| |
|
| | return d |
| |
|
| | def twonn_dimension(self, return_xy=False): |
| | N = len(self.data) |
| | mu = [] |
| | for i in tqdm(range(N)): |
| | dist = np.sort(np.sqrt(np.sum((self.data[i]-self.data)**2, axis=1))) |
| | r1, r2 = dist[dist>0][:2] |
| | mu.append((i+1,r2/r1)) |
| | sigma_i = dict(zip(range(1,len(mu)+1), np.array(sorted(mu, key=lambda x: x[1]))[:,0].astype(int))) |
| | mu = dict(mu) |
| | F_i = {} |
| | for i in mu: |
| | F_i[sigma_i[i]] = i/N |
| | x = np.log([mu[i] for i in sorted(mu.keys())]) |
| | y = np.array([1-F_i[i] for i in sorted(mu.keys())]) |
| | x = x[y>0] |
| | y = y[y>0] |
| | y = -1*np.log(y) |
| | d = np.linalg.lstsq(np.vstack([x, np.zeros(len(x))]).T, y, rcond=None)[0][0] |
| | if return_xy: |
| | return d, x, y |
| | else: |
| | return d |
| |
|
| | def twonn_dimension_fast(self): |
| | N = len(self.data) |
| | mu = self.find_mu().tolist() |
| | mu = list(enumerate(mu, start=1)) |
| | sigma_i = dict(zip(range(1,len(mu)+1), np.array(sorted(mu, key=lambda x: x[1]))[:,0].astype(int))) |
| | mu = dict(mu) |
| | F_i = {} |
| | for i in mu: |
| | F_i[sigma_i[i]] = i/N |
| | x = np.log([mu[i] for i in sorted(mu.keys())]) |
| | y = np.array([1-F_i[i] for i in sorted(mu.keys())]) |
| | x = x[y>0] |
| | y = y[y>0] |
| | y = -1*np.log(y) |
| | d = np.linalg.lstsq(np.vstack([x, np.zeros(len(x))]).T, y, rcond=None)[0][0] |
| | return d |
| |
|