word2vec-galaxy / word_vectors.py
Gayanukaa's picture
Add Word2Vec Galaxy Static app with Docker support
fb5dd49
import logging
import os
from pathlib import Path
import gensim.downloader as api
import numpy as np
from sklearn.decomposition import PCA
# Module-level logger, named after this module; WordVectorAnalyzer writes its log messages here.
logger = logging.getLogger(__name__)
class WordVectorAnalyzer:
"""Handles Word2Vec model loading and word vector operations"""
def __init__(self, model_name: str = "word2vec-google-news-300"):
self.model = self._load_model(model_name)
self.vocab = set(self.model.key_to_index.keys())
def _load_model(self, model_name: str):
model_path = Path(api.base_dir) / model_name
if not os.path.exists(model_path):
logger.info("Downloading Word2Vec model (~1.5 GB) — this only happens once...")
else:
logger.info("Loading cached Word2Vec model from %s", api.base_dir)
model = api.load(model_name)
logger.info("Model ready. Vocabulary size: %d", len(model.key_to_index))
return model
def get_vector(self, word: str):
return self.model[word].tolist() if word in self.vocab else None
def find_similar_words(self, word: str, num_words: int = 20) -> list[str]:
if word not in self.vocab:
return []
similar = self.model.most_similar(word, topn=num_words)
return [word] + [w for w, _ in similar]
def word_analogy(self, word1: str, word2: str, word3: str) -> tuple[str | None, str | None]:
"""Compute word3 - word1 + word2. Returns (result, error)."""
missing = [w for w in [word1, word2, word3] if w not in self.vocab]
if missing:
return None, f"Words not in vocabulary: {', '.join(missing)}"
try:
result_vector = self.model[word3] - self.model[word1] + self.model[word2]
candidates = self.model.similar_by_vector(result_vector, topn=20)
input_set = {word1.lower(), word2.lower(), word3.lower()}
for word, _ in candidates:
if word.lower() not in input_set:
return word, None
return None, "No valid analogy found"
except Exception as e:
return None, str(e)
def reduce_dimensions(self, words: list[str]) -> tuple[list[str], np.ndarray]:
"""Reduce 300-dim word vectors to 3D via PCA."""
valid = [w for w in words if w in self.vocab]
if not valid:
return [], np.zeros((0, 3))
vectors = np.array([self.model[w] for w in valid])
if len(valid) < 2:
return valid, np.zeros((len(valid), 3))
n_components = min(3, len(valid))
pca = PCA(n_components=n_components)
reduced = pca.fit_transform(vectors)
if n_components < 3:
padded = np.zeros((len(valid), 3))
padded[:, :n_components] = reduced
return valid, padded
return valid, reduced