Spaces:
Configuration error
Configuration error
| import pandas as pd | |
| from sklearn.decomposition import PCA | |
| from gensim.models import KeyedVectors | |
def load_embeddings(path, binary=False, randomPCA=False, limit=None):
    """Load word2vec-format vectors and return one row per lower-cased word.

    Args:
        path: Location of the word2vec-format file; passed to
            ``KeyedVectors.load_word2vec_format``.
        binary: Forwarded to the loader as its ``binary`` argument.
        randomPCA: When true, the 2-component PCA is configured with
            ``svd_solver='randomized'``; otherwise ``PCA(n_components=2)``
            is used with its remaining settings left at their defaults.
        limit: Forwarded to the loader as its ``limit`` argument.

    Returns:
        A ``pandas.DataFrame`` with the columns ``word`` (lower-cased),
        ``embedding`` (the word's row of ``model.get_normed_vectors()``)
        and ``pca`` (that row's 2-component PCA projection). When several
        cased spellings collapse to the same lower-cased word, only the
        first one in the model's key order is kept.
    """
    if randomPCA:
        pca = PCA(
            n_components=2,
            # copy=False lets PCA overwrite its input while centering it.
            # The same array is stored in the 'embedding' column below, so
            # it must stay untouched to keep the normalised vectors intact
            # and consistent with the non-randomised branch.
            copy=True,
            whiten=False,
            svd_solver='randomized',
            iterated_power='auto',
        )
    else:
        pca = PCA(n_components=2)

    print("--------> PATH:", path)
    model = KeyedVectors.load_word2vec_format(path, binary=binary, limit=limit)

    # Cased vocab: keys exactly as stored in the model, with their
    # normalised vectors and 2-D projections in matching order.
    cased_words = model.index_to_key
    cased_emb = model.get_normed_vectors()
    cased_pca = pca.fit_transform(cased_emb)

    df_cased = pd.DataFrame(
        zip(cased_words, cased_emb, cased_pca),
        columns=['word', 'embedding', 'pca'],
    )

    # Fold case, then keep the first row seen for each lower-cased word.
    df_cased['word'] = df_cased['word'].str.lower()
    df_uncased = df_cased.drop_duplicates(subset='word')
    return df_uncased
| #load_embeddings('data/fasttext-sbwc.100k.vec', limit=1000) |