| from word2vec import * |
| import numpy as np |
| from sklearn.decomposition import PCA |
| from sklearn.preprocessing import StandardScaler |
| import pandas as pd |
| import gensim |
| import umap |
|
|
|
|
def create_3d_vectors(word, time_slice, nearest_neighbours_vectors):
    """
    Reduce a time-slice word2vec model's embedding matrix to 3D with PCA.

    Loads ``models/{time_slice}.model`` and projects every word vector onto
    its first three principal components.

    NOTE(review): ``word`` and ``nearest_neighbours_vectors`` are accepted but
    never used here — kept for interface compatibility with callers.

    Returns a DataFrame with columns ``['word', 'x', 'y', 'z']``, one row per
    vocabulary entry, in the model's index order.
    """
    model = load_word2vec_model(f'models/{time_slice}.model')

    # Fit PCA on the full embedding matrix and project it down to 3 components.
    reducer = PCA(n_components=3)
    coords = reducer.fit_transform(pd.DataFrame(model.wv.vectors))

    result = pd.DataFrame(coords, columns=['x', 'y', 'z'])
    # Prepend the vocabulary so each 3D point is labelled with its word.
    result.insert(0, 'word', model.wv.index_to_key)
    return result
|
|
|
|
|
|
|
|
def create_3d_models(time_slice):
    """
    Build and persist a 3D PCA projection of one time slice's word vectors.

    The time-slice name is first mapped to a model filename via
    ``convert_time_name_to_model``; the resulting projection is written to
    ``3d_models/{time_slice}_3d.csv`` (note: keyed by the original
    ``time_slice`` name, not the converted one).

    Returns a tuple ``(pca_model_df, pca_vectors)`` — the DataFrame with
    columns ``['word', 'x', 'y', 'z']`` and the fitted PCA object, so callers
    can reuse the same projection for other vectors.
    """
    model_name = convert_time_name_to_model(time_slice)
    model = load_word2vec_model(f'models/{model_name}.model')

    # Fit PCA once on the whole embedding matrix; keep the fitted object so
    # it can be returned alongside the projected coordinates.
    reducer = PCA(n_components=3)
    coords = reducer.fit_transform(pd.DataFrame(model.wv.vectors))

    projection = pd.DataFrame(coords, columns=['x', 'y', 'z'])
    projection.insert(0, 'word', model.wv.index_to_key)

    # Persist for later lookup (e.g. nearest_neighbours_to_pca_vectors).
    projection.to_csv(f'3d_models/{time_slice}_3d.csv', index=False)
    return projection, reducer
|
|
|
|
def nearest_neighbours_to_pca_vectors(word, time_slice, nearest_neighbours_vectors):
    """
    Look up the precomputed 3D coordinates for a word's nearest neighbours.

    Reads ``3d_models/{time_slice}_3d.csv`` (as written by
    ``create_3d_models``) and, for each neighbour entry, collects its 3D
    coordinates and cosine similarity.

    NOTE(review): the ``word`` parameter itself is not used — only the
    neighbours are looked up; kept for interface compatibility.

    Parameters
    ----------
    word : str
        The query word (unused; see note above).
    time_slice : str
        Name of the time slice whose 3D CSV should be consulted.
    nearest_neighbours_vectors : iterable
        Neighbour records where index 0 is the neighbour word and index 3 is
        its cosine similarity — presumably produced by a project helper;
        TODO confirm the tuple layout against the caller.

    Returns
    -------
    pandas.DataFrame
        Columns ``['word', 'cosine_sim', '3d_vector']``, one row per neighbour.

    Raises
    ------
    IndexError
        If a neighbour word is not present in the 3D CSV.
    """
    model_df = pd.read_csv(f'3d_models/{time_slice}_3d.csv')

    new_data = []
    for neighbour in nearest_neighbours_vectors:
        # Bug fix: the loop previously rebound the *parameter* `word`,
        # shadowing the query word from the first iteration onward.
        neighbour_word = neighbour[0]
        cosine_sim = neighbour[3]

        # First (and assumed only) matching row's x/y/z coordinates;
        # raises IndexError if the word is missing from the CSV.
        vector_3d = model_df[model_df['word'] == neighbour_word][['x', 'y', 'z']].values[0]

        new_data.append({'word': neighbour_word, 'cosine_sim': cosine_sim, '3d_vector': vector_3d})

    return pd.DataFrame(new_data)
|
|