| """ |
| Word2Vec Baseline (gensim) |
| |
| Trains a Word2Vec model on your corpus and provides the same interface |
| as the transformer engine, so you can compare results side by side. |
| |
| Key limitation: Word2Vec gives ONE static vector per word regardless of |
| context. "pizza" always has the same embedding whether it means food or school. |
| The only contextual signal comes from averaging word vectors in a sentence. |
| |
| Usage: |
| w2v = Word2VecEngine() |
| w2v.add_document("doc1", text) |
| w2v.build_index() # trains Word2Vec on your corpus |
| results = w2v.query("a place where children learn", top_k=5) |
| score = w2v.compare_texts("pizza gives me homework", "school gives me homework") |
| """ |
|
|
| import json |
| import re |
| import logging |
| from dataclasses import dataclass |
| from pathlib import Path |
| from typing import Optional |
|
|
| import numpy as np |
| from gensim.models import Word2Vec |
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
@dataclass
class W2VResult:
    """One ranked hit returned by a similarity search.

    Attributes:
        text: The matched sentence, exactly as stored in the index.
        doc_id: Identifier of the document the sentence was added under.
        score: Cosine similarity between the query and the sentence.
        rank: 1-based position of this hit in the result list (1 = best).
    """

    text: str
    doc_id: str
    score: float
    rank: int
|
|
|
|
class Word2VecEngine:
    """
    Word2Vec baseline for comparison with the transformer engine.

    Trains Word2Vec on your corpus, represents sentences as averaged
    word vectors, and uses cosine similarity for matching.

    Typical flow: ``add_document`` (one or more times), then
    ``build_index``, then ``query`` / ``compare_texts`` / the word-level
    helpers. Every method that needs the trained model raises
    ``RuntimeError`` if it is called before ``build_index`` or ``load``.
    """

    def __init__(
        self,
        vector_size: int = 100,
        window: int = 5,
        min_count: int = 1,
        epochs: int = 50,
        sg: int = 1,
        workers: int = 4,
    ):
        """
        Args:
            vector_size: Dimensionality of word vectors.
            window: Context window size.
            min_count: Ignore words with frequency below this.
            epochs: Training epochs.
            sg: 1 for skip-gram, 0 for CBOW.
            workers: Number of worker threads passed to Word2Vec training.
        """
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.epochs = epochs
        self.sg = sg
        self.workers = workers

        # Populated by build_index() or load().
        self.model: Optional[Word2Vec] = None
        # sentences[i] was extracted from the document sentence_docs[i].
        self.sentences: list[str] = []
        self.sentence_docs: list[str] = []
        # One averaged word vector per indexed sentence (row i <-> sentences[i]).
        self.sentence_vecs: Optional[np.ndarray] = None

    def add_document(self, doc_id: str, text: str) -> int:
        """Add a document. Returns number of sentences extracted.

        The text is split into sentences and each one is recorded together
        with ``doc_id``. Sentence vectors are only computed in
        ``build_index``, so sentences added after the index was built are
        not searchable until ``build_index`` is called again.
        """
        sents = self._split_sentences(text)
        self.sentences.extend(sents)
        self.sentence_docs.extend([doc_id] * len(sents))
        return len(sents)

    def build_index(self) -> dict:
        """Train Word2Vec on the corpus and compute sentence vectors.

        Returns:
            A dict with ``vocab_size``, ``sentences`` and ``vector_size``.

        Raises:
            RuntimeError: If no sentences have been added yet.
        """
        if not self.sentences:
            # Fail early with a clear message instead of letting training
            # blow up on an empty vocabulary.
            raise RuntimeError(
                "Cannot build index: no sentences have been added. "
                "Call add_document() first."
            )

        tokenized = [self._tokenize(s) for s in self.sentences]

        self.model = Word2Vec(
            sentences=tokenized,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            epochs=self.epochs,
            sg=self.sg,
            workers=self.workers,
        )

        self.sentence_vecs = np.array(
            [self._sentence_vector(tokens) for tokens in tokenized],
            dtype=np.float32,
        )

        vocab_size = len(self.model.wv)
        logger.info(
            "Word2Vec trained: %d words, %d sentences",
            vocab_size,
            len(self.sentences),
        )
        return {
            "vocab_size": vocab_size,
            "sentences": len(self.sentences),
            "vector_size": self.vector_size,
        }

    def compare_texts(self, text_a: str, text_b: str) -> float:
        """Cosine similarity between two texts (averaged word vectors).

        Returns 0.0 when either text contains no in-vocabulary word.

        Raises:
            RuntimeError: If the model has not been trained or loaded.
        """
        vec_a = self._sentence_vector(self._tokenize(text_a))
        vec_b = self._sentence_vector(self._tokenize(text_b))
        return float(self._cosine(vec_a, vec_b))

    def query(self, text: str, top_k: int = 10) -> "list[W2VResult]":
        """Find most similar sentences to a query.

        Args:
            text: Free-text query; it is tokenized and averaged like the
                indexed sentences.
            top_k: Maximum number of results. Values <= 0 yield ``[]``.

        Returns:
            Results ordered by descending cosine similarity, ranks
            starting at 1.

        Raises:
            RuntimeError: If the index has not been built or loaded.
        """
        self._require_model()
        if self.sentence_vecs is None:
            raise RuntimeError(
                "Sentence index is not available. Call build_index() or load() first."
            )
        # A negative top_k would otherwise slice "all but the last |k|" hits.
        if top_k <= 0 or len(self.sentence_vecs) == 0:
            return []

        query_vec = self._sentence_vector(self._tokenize(text))
        scores = self.sentence_vecs @ query_vec
        norms = np.linalg.norm(self.sentence_vecs, axis=1) * np.linalg.norm(query_vec)
        # Zero-norm vectors (no in-vocabulary words) would divide by zero;
        # their dot product is 0, so the resulting score stays 0.
        norms[norms == 0] = 1e-10
        scores = scores / norms

        top_idx = np.argsort(scores)[::-1][:top_k]
        return [
            W2VResult(
                text=self.sentences[i],
                doc_id=self.sentence_docs[i],
                score=float(scores[i]),
                rank=rank + 1,
            )
            for rank, i in enumerate(top_idx)
        ]

    def most_similar_words(self, word: str, top_k: int = 10) -> list[tuple[str, float]]:
        """Find words most similar to a given word (static, no context).

        The word is lower-cased first. Returns ``[]`` when the word is not
        in the vocabulary.

        Raises:
            RuntimeError: If the model has not been trained or loaded.
        """
        wv = self._require_model().wv
        word = word.lower()
        if word not in wv:
            return []
        return wv.most_similar(word, topn=top_k)

    def word_similarity(self, word_a: str, word_b: str) -> float:
        """Cosine similarity between two individual words.

        Both words are lower-cased first. Returns 0.0 when either word is
        not in the vocabulary.

        Raises:
            RuntimeError: If the model has not been trained or loaded.
        """
        wv = self._require_model().wv
        a, b = word_a.lower(), word_b.lower()
        if a not in wv or b not in wv:
            return 0.0
        return float(wv.similarity(a, b))

    def save(self, directory: str) -> dict:
        """Save trained Word2Vec state to disk for later restore.

        Writes ``w2v.model``, ``sentence_vecs.npy``, ``w2v_meta.json`` and
        ``w2v_sentences.json`` into ``directory`` (created if missing).

        Returns:
            The metadata dict that was written to ``w2v_meta.json``.

        Raises:
            RuntimeError: If the model has not been trained yet. The check
                runs before anything is created on disk.
        """
        if self.model is None:
            raise RuntimeError("Cannot save: model has not been trained yet.")

        save_dir = Path(directory)
        save_dir.mkdir(parents=True, exist_ok=True)

        self.model.save(str(save_dir / "w2v.model"))
        np.save(save_dir / "sentence_vecs.npy", self.sentence_vecs)

        meta = {
            "vector_size": self.vector_size,
            "window": self.window,
            "min_count": self.min_count,
            "epochs": self.epochs,
            "sg": self.sg,
            "workers": self.workers,
            "num_sentences": len(self.sentences),
            "vocab_size": len(self.model.wv),
        }
        with open(save_dir / "w2v_meta.json", "w", encoding="utf-8") as f:
            json.dump(meta, f, indent=2)

        with open(save_dir / "w2v_sentences.json", "w", encoding="utf-8") as f:
            json.dump({"sentences": self.sentences, "sentence_docs": self.sentence_docs}, f)

        logger.info("Word2Vec saved to %s: %d sentences, %d vocab",
                    directory, len(self.sentences), len(self.model.wv))
        return meta

    @classmethod
    def load(cls, directory: str) -> "Word2VecEngine":
        """Load a previously saved Word2Vec state from disk.

        Raises:
            FileNotFoundError: If ``w2v_meta.json`` is missing in
                ``directory``.
        """
        save_dir = Path(directory)
        if not (save_dir / "w2v_meta.json").is_file():
            raise FileNotFoundError(f"No saved Word2Vec state at {directory}")

        with open(save_dir / "w2v_meta.json", encoding="utf-8") as f:
            meta = json.load(f)

        engine = cls(
            vector_size=meta["vector_size"],
            window=meta["window"],
            min_count=meta["min_count"],
            epochs=meta["epochs"],
            sg=meta["sg"],
            # Metadata written before "workers" was persisted lacks the key.
            workers=meta.get("workers", 4),
        )

        engine.model = Word2Vec.load(str(save_dir / "w2v.model"))
        engine.sentence_vecs = np.load(save_dir / "sentence_vecs.npy")

        with open(save_dir / "w2v_sentences.json", encoding="utf-8") as f:
            data = json.load(f)
        engine.sentences = data["sentences"]
        engine.sentence_docs = data["sentence_docs"]

        logger.info("Word2Vec loaded from %s: %d sentences, %d vocab",
                    directory, len(engine.sentences), len(engine.model.wv))
        return engine

    @staticmethod
    def has_saved_state(directory: str) -> bool:
        """Check if a saved Word2Vec state exists at the given directory."""
        return (Path(directory) / "w2v_meta.json").is_file()

    def _require_model(self) -> "Word2Vec":
        """Return the trained model, or raise RuntimeError if there is none."""
        if self.model is None:
            raise RuntimeError(
                "Word2Vec model is not available. Call build_index() or load() first."
            )
        return self.model

    def _sentence_vector(self, tokens: list[str]) -> np.ndarray:
        """Average word vectors for a sentence.

        Out-of-vocabulary tokens are skipped; if none remain, a zero
        vector of length ``vector_size`` is returned.
        """
        wv = self._require_model().wv
        vecs = [wv[t] for t in tokens if t in wv]
        if not vecs:
            return np.zeros(self.vector_size, dtype=np.float32)
        return np.mean(vecs, axis=0).astype(np.float32)

    @staticmethod
    def _cosine(a: np.ndarray, b: np.ndarray) -> float:
        """Cosine similarity of two vectors; 0.0 if either has zero norm."""
        dot = np.dot(a, b)
        norm = np.linalg.norm(a) * np.linalg.norm(b)
        return dot / norm if norm > 0 else 0.0

    @staticmethod
    def _tokenize(text: str) -> list[str]:
        """Lower-case ``text`` and return its runs of ASCII letters a-z."""
        return re.findall(r"\b[a-z]+\b", text.lower())

    @staticmethod
    def _split_sentences(text: str) -> list[str]:
        """Split on whitespace following ``.``, ``!`` or ``?``.

        Fragments with fewer than four whitespace-separated words are
        dropped.
        """
        parts = re.split(r"(?<=[.!?])\s+", text.strip())
        return [s.strip() for s in parts if len(s.split()) >= 4]
|
|