Spaces:
Running
Running
| import logging | |
| import os | |
| from pathlib import Path | |
| import gensim.downloader as api | |
| import numpy as np | |
| from sklearn.decomposition import PCA | |
| logger = logging.getLogger(__name__) | |
| class WordVectorAnalyzer: | |
| """Handles Word2Vec model loading and word vector operations""" | |
| def __init__(self, model_name: str = "word2vec-google-news-300"): | |
| self.model = self._load_model(model_name) | |
| self.vocab = set(self.model.key_to_index.keys()) | |
| def _load_model(self, model_name: str): | |
| model_path = Path(api.base_dir) / model_name | |
| if not os.path.exists(model_path): | |
| logger.info("Downloading Word2Vec model (~1.5 GB) — this only happens once...") | |
| else: | |
| logger.info("Loading cached Word2Vec model from %s", api.base_dir) | |
| model = api.load(model_name) | |
| logger.info("Model ready. Vocabulary size: %d", len(model.key_to_index)) | |
| return model | |
| def get_vector(self, word: str): | |
| return self.model[word].tolist() if word in self.vocab else None | |
| def find_similar_words(self, word: str, num_words: int = 20) -> list[str]: | |
| if word not in self.vocab: | |
| return [] | |
| similar = self.model.most_similar(word, topn=num_words) | |
| return [word] + [w for w, _ in similar] | |
| def word_analogy(self, word1: str, word2: str, word3: str) -> tuple[str | None, str | None]: | |
| """Compute word3 - word1 + word2. Returns (result, error).""" | |
| missing = [w for w in [word1, word2, word3] if w not in self.vocab] | |
| if missing: | |
| return None, f"Words not in vocabulary: {', '.join(missing)}" | |
| try: | |
| result_vector = self.model[word3] - self.model[word1] + self.model[word2] | |
| candidates = self.model.similar_by_vector(result_vector, topn=20) | |
| input_set = {word1.lower(), word2.lower(), word3.lower()} | |
| for word, _ in candidates: | |
| if word.lower() not in input_set: | |
| return word, None | |
| return None, "No valid analogy found" | |
| except Exception as e: | |
| return None, str(e) | |
| def reduce_dimensions(self, words: list[str]) -> tuple[list[str], np.ndarray]: | |
| """Reduce 300-dim word vectors to 3D via PCA.""" | |
| valid = [w for w in words if w in self.vocab] | |
| if not valid: | |
| return [], np.zeros((0, 3)) | |
| vectors = np.array([self.model[w] for w in valid]) | |
| if len(valid) < 2: | |
| return valid, np.zeros((len(valid), 3)) | |
| n_components = min(3, len(valid)) | |
| pca = PCA(n_components=n_components) | |
| reduced = pca.fit_transform(vectors) | |
| if n_components < 3: | |
| padded = np.zeros((len(valid), 3)) | |
| padded[:, :n_components] = reduced | |
| return valid, padded | |
| return valid, reduced | |