Dev Seth
init space
50aa037
import numpy as np
import pandas as pd
import torch
import pickle
class SimilaritySpace:
    '''
    PyTorch tensor implementation of a similarity space.
    Much faster than sklearn's NearestNeighbors.

    On construction the full pairwise cosine-distance matrix of the feature
    vectors is computed once and, for every codepoint, the closest
    codepoints and their distances are cached as NumPy arrays:

    - `indices[i]` holds the neighbor *codepoints* (not row numbers) of the
      codepoint stored at row `i`, ordered by increasing distance.
    - `distances[i]` holds the matching cosine distances.
    '''

    def __init__(self, desc: str, feature_vectors: pd.DataFrame, num_nearest=10) -> None:
        '''
        Build the cached nearest-neighbor tables.

        desc: free-form description of this space.
        feature_vectors: DataFrame with a `codepoint` column (integers) and a
            `features` column holding one equal-length vector per row.
        num_nearest: number of neighbors cached per codepoint. It is capped
            at the number of rows, because that many neighbors is all there is.
        '''
        self.desc = desc
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        print("using device", self.device)

        # Two-way mapping between matrix row numbers and codepoints.
        self.idx_to_codepoint = np.array(
            feature_vectors['codepoint'], dtype=np.int64)
        self.codepoint_to_idx = {
            codepoint: idx for idx, codepoint in enumerate(self.idx_to_codepoint)}

        features = torch.tensor(
            np.vstack(feature_vectors['features'].tolist()),
            dtype=torch.float32, device=self.device)

        # create a pairwise distance matrix
        distance_matrix = self.matrix_cosine_distance(features)

        # calculate the nearest neighbors for each codepoint; torch.topk
        # raises if k is larger than the row length, so cap it.
        k = min(num_nearest, distance_matrix.shape[1])
        distances, indices = torch.topk(
            distance_matrix, k=k, dim=1, largest=False)
        self.distances = distances.cpu().numpy()

        # Translate row numbers into codepoints with a single vectorised
        # lookup; the result is an int64 array of the same shape.
        self.indices = self.idx_to_codepoint[indices.cpu().numpy()]

    @staticmethod
    def cosine_distance(x, y) -> float:
        '''
        Cosine distance between two 1-D vectors.
        A small constant is added to the denominator so that zero vectors
        do not cause a division by zero.
        '''
        return 1 - np.dot(x, y) / ((np.linalg.norm(x) * np.linalg.norm(y)) + 1e-6)

    @staticmethod
    def matrix_cosine_distance(X: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
        '''
        Compute the pairwise cosine distance between all rows of X.
        X is a tensor of shape (n_samples, n_features)

        The product of norms is clamped to at least `eps`, so an all-zero
        row yields a distance of 1 to every row instead of NaN.
        Returns a tensor of shape (n_samples, n_samples).
        '''
        norm = torch.norm(X, dim=1, keepdim=True)
        denominator = torch.clamp(norm @ norm.T, min=eps)
        return 1 - (X @ X.T) / denominator

    def topk_neighbors(self, codepoint: int, k: int):
        '''
        Return the first `k + 1` cached neighbor codepoints of `codepoint`,
        nearest first. The first entry is normally `codepoint` itself, since
        its distance to itself is 0. At most the cached number of neighbors
        is returned. Raises KeyError for an unknown codepoint.
        '''
        return self.indices[self.codepoint_to_idx[codepoint]][:k+1]

    def topk_distances(self, codepoint: int, k: int):
        '''
        Return the cosine distances matching `topk_neighbors(codepoint, k)`.
        Raises KeyError for an unknown codepoint.
        '''
        return self.distances[self.codepoint_to_idx[codepoint]][:k+1]

    def set_desc(self, desc: str) -> None:
        '''
        Replace the description of this space.
        '''
        self.desc = desc
class SimHelper:
    '''
    Helpers for creating a similarity space from disk and for saving and
    loading it as a pickle file.
    '''

    @staticmethod
    def create_sim_space(desc: str, path: str, key: str = 'df', num_nearest: int = 10) -> 'SimilaritySpace':
        '''
        Creates a similarity space from a feature vector HDF file stored at `path` with key `key`.
        `desc` and `num_nearest` are passed through to SimilaritySpace.
        '''
        df = pd.read_hdf(path, key)
        return SimilaritySpace(desc=desc, feature_vectors=df, num_nearest=num_nearest)

    @staticmethod
    def load_sim_space(name: str):
        '''
        Load the object pickled at `name + '.pkl'` and return it.
        '''
        # NOTE: unpickling can execute arbitrary code; only load files that
        # come from a trusted source.
        with open(name + '.pkl', 'rb') as pickle_file:
            return pickle.load(pickle_file)

    @staticmethod
    def save_sim_space(sim_space: 'SimilaritySpace', name: str) -> None:
        '''
        Pickle `sim_space` to `name + '.pkl'`, overwriting any existing file.
        '''
        # The context manager guarantees the data is flushed and the handle
        # closed even if pickling fails part-way.
        with open(name + '.pkl', 'wb') as pickle_file:
            pickle.dump(sim_space, pickle_file)