""" Обучение распределённых представлений: Word2Vec (CBOW/Skip-gram), FastText (cbow/skipgram), Doc2Vec (PV-DM/PV-DBOW). Предоставляет единый интерфейс обучения, сохранения, загрузки и базовых оценок. """ from __future__ import annotations import os import time from dataclasses import dataclass from pathlib import Path from typing import Iterable, List, Optional, Tuple, Dict, Any import numpy as np import pandas as pd from gensim.models import Word2Vec, FastText, Doc2Vec from gensim.models.doc2vec import TaggedDocument from gensim.utils import simple_preprocess try: from glove import Glove, Corpus GLOVE_AVAILABLE = True except ImportError: try: from glove_python import Glove, Corpus GLOVE_AVAILABLE = True except ImportError: GLOVE_AVAILABLE = False print("⚠️ GloVe не установлен. Установите: pip install glove-python-binary") @dataclass class TrainConfig: model_type: str # w2v | fasttext | doc2vec | glove vector_size: int = 300 window: int = 8 min_count: int = 2 sg: int = 1 # 0=CBOW, 1=Skip-gram для w2v/fasttext; для doc2vec игнорируется dm: int = 1 # 1=PV-DM, 0=PV-DBOW для doc2vec epochs: int = 10 workers: int = 4 negative: int = 5 hs: int = 0 seed: int = 42 # GloVe специфичные параметры alpha: float = 0.75 # для GloVe x_max: int = 100 # для GloVe def _tokenize_corpus(texts: Iterable[str]) -> List[List[str]]: return [simple_preprocess(t, deacc=False, min_len=1) for t in texts] def train_word2vec(texts: Iterable[str], cfg: TrainConfig) -> Word2Vec: sentences = _tokenize_corpus(texts) model = Word2Vec( vector_size=cfg.vector_size, window=cfg.window, min_count=cfg.min_count, sg=cfg.sg, workers=cfg.workers, negative=cfg.negative, hs=cfg.hs, seed=cfg.seed, ) model.build_vocab(sentences) # Если словарь пуст из-за min_count — понижаем порог и повторяем if len(model.wv) == 0 and cfg.min_count > 1: model.min_count = 1 model.build_vocab(sentences, update=False) if len(model.wv) == 0: return model # вернем пустую модель; UI отобразит, что соседей нет model.train(sentences, total_examples=len(sentences), epochs=cfg.epochs) return model def train_fasttext(texts: Iterable[str], cfg: TrainConfig) -> FastText: sentences = _tokenize_corpus(texts) model = FastText( vector_size=cfg.vector_size, window=cfg.window, min_count=cfg.min_count, sg=cfg.sg, workers=cfg.workers, negative=cfg.negative, hs=cfg.hs, seed=cfg.seed, ) model.build_vocab(sentences) if len(model.wv) == 0 and cfg.min_count > 1: model.min_count = 1 model.build_vocab(sentences, update=False) if len(model.wv) == 0: return model model.train(sentences, total_examples=len(sentences), epochs=cfg.epochs) return model def train_doc2vec(texts: Iterable[str], cfg: TrainConfig) -> Doc2Vec: tagged = [TaggedDocument(simple_preprocess(t), [i]) for i, t in enumerate(texts)] model = Doc2Vec( vector_size=cfg.vector_size, window=cfg.window, min_count=cfg.min_count, dm=cfg.dm, workers=cfg.workers, negative=cfg.negative, hs=cfg.hs, seed=cfg.seed, ) model.build_vocab(tagged) if len(model.wv) == 0 and cfg.min_count > 1: model.min_count = 1 model.build_vocab(tagged, update=False) if len(model.wv) == 0: return model model.train(tagged, total_examples=len(tagged), epochs=cfg.epochs) return model def train_glove(texts: Iterable[str], cfg: TrainConfig): """Обучает GloVe модель.""" if not GLOVE_AVAILABLE: raise ImportError( "GloVe не установлен. Установите: pip install glove-python-binary\n" "Или используйте альтернативу: pip install glove-python" ) sentences = _tokenize_corpus(texts) # Создаем корпус для GloVe corpus = Corpus() corpus.fit(sentences, window=cfg.window) # Обучаем модель model = Glove(no_components=cfg.vector_size, learning_rate=0.05) model.fit(corpus.matrix, epochs=cfg.epochs, no_threads=cfg.workers, verbose=True) model.add_dictionary(corpus.dictionary) return model def train_model(texts: Iterable[str], cfg: TrainConfig): t0 = time.time() if cfg.model_type == "w2v": model = train_word2vec(texts, cfg) elif cfg.model_type == "fasttext": model = train_fasttext(texts, cfg) elif cfg.model_type == "doc2vec": model = train_doc2vec(texts, cfg) elif cfg.model_type == "glove": model = train_glove(texts, cfg) else: raise ValueError("model_type должен быть 'w2v', 'fasttext', 'doc2vec' или 'glove'") train_time = time.time() - t0 return model, train_time def save_model(model, out_path: str) -> None: Path(os.path.dirname(out_path)).mkdir(parents=True, exist_ok=True) # GloVe имеет другой метод сохранения if GLOVE_AVAILABLE and hasattr(model, 'word_vectors') and hasattr(model, 'dictionary'): model.save(out_path) else: # Gensim модели model.save(out_path) def load_model(path: str): # gensim сам определит тип по расширению/классу from gensim.models import Word2Vec as _W2V, FastText as _FT, Doc2Vec as _D2V try: return _W2V.load(path) except Exception: pass try: return _FT.load(path) except Exception: pass try: return _D2V.load(path) except Exception: pass # Пробуем загрузить GloVe if GLOVE_AVAILABLE: try: from glove import Glove return Glove.load(path) except Exception: pass raise ValueError(f"Не удалось загрузить модель из {path}") def evaluate_neighbors(model, test_words: List[str], topn: int = 10) -> Dict[str, List[Tuple[str, float]]]: results: Dict[str, List[Tuple[str, float]]] = {} # GloVe имеет другой API if GLOVE_AVAILABLE and hasattr(model, 'word_vectors') and hasattr(model, 'dictionary'): # GloVe модель - вычисляем ближайших соседей вручную for w in test_words: try: if w in model.dictionary: vec_w = model.word_vectors[model.dictionary[w]] similarities = [] for word, idx in model.dictionary.items(): if word != w: vec = model.word_vectors[idx] sim = float(np.dot(vec_w, vec) / (np.linalg.norm(vec_w) * np.linalg.norm(vec))) similarities.append((word, sim)) similarities.sort(key=lambda x: x[1], reverse=True) results[w] = similarities[:topn] else: results[w] = [] except: results[w] = [] else: # Gensim модели (Word2Vec, FastText, Doc2Vec) kv = model.wv if hasattr(model, "wv") else model for w in test_words: if w in kv: results[w] = kv.most_similar(w, topn=topn) else: results[w] = [] return results def cosine_similarity(model, word_pairs: List[Tuple[str, str]]) -> List[Tuple[str, str, float]]: out: List[Tuple[str, str, float]] = [] # GloVe имеет другой API if GLOVE_AVAILABLE and hasattr(model, 'word_vectors') and hasattr(model, 'dictionary'): # GloVe модель for a, b in word_pairs: try: if a in model.dictionary and b in model.dictionary: vec_a = model.word_vectors[model.dictionary[a]] vec_b = model.word_vectors[model.dictionary[b]] sim = float(np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))) out.append((a, b, sim)) else: out.append((a, b, np.nan)) except: out.append((a, b, np.nan)) else: # Gensim модели kv = model.wv if hasattr(model, "wv") else model for a, b in word_pairs: if a in kv and b in kv: out.append((a, b, float(kv.similarity(a, b)))) else: out.append((a, b, np.nan)) return out def word_analogy(model, a: str, b: str, c: str, topn: int = 10) -> List[Tuple[str, float]]: # GloVe не имеет встроенного метода для аналогий, вычисляем вручную if GLOVE_AVAILABLE and hasattr(model, 'word_vectors') and hasattr(model, 'dictionary'): # GloVe модель - вычисляем аналогию вручную try: if all(token in model.dictionary for token in [a, b, c]): vec_a = model.word_vectors[model.dictionary[a]] vec_b = model.word_vectors[model.dictionary[b]] vec_c = model.word_vectors[model.dictionary[c]] target = vec_b - vec_a + vec_c # Находим ближайшие векторы similarities = [] for word, idx in model.dictionary.items(): if word not in [a, b, c]: vec = model.word_vectors[idx] sim = float(np.dot(target, vec) / (np.linalg.norm(target) * np.linalg.norm(vec))) similarities.append((word, sim)) similarities.sort(key=lambda x: x[1], reverse=True) return similarities[:topn] except: pass return [] else: # Gensim модели kv = model.wv if hasattr(model, "wv") else model if all(token in kv for token in [a, b, c]): return kv.most_similar(positive=[b, c], negative=[a], topn=topn) return [] def export_training_report(cfg: TrainConfig, train_time: float, model_path: str, extra: Optional[Dict[str, Any]] = None) -> pd.DataFrame: data = { "Модель": cfg.model_type, "Размерность": cfg.vector_size, "Окно": cfg.window, "Min count": cfg.min_count, "Архитектура": ("skipgram" if cfg.sg == 1 else "cbow") if cfg.model_type in {"w2v", "fasttext"} else ("pv-dm" if cfg.dm == 1 else "pv-dbow"), "Эпохи": cfg.epochs, "Время обучения (с)": round(train_time, 2), "Путь": model_path, } if extra: data.update(extra) return pd.DataFrame([data]) if __name__ == "__main__": texts = [ "Москва является столицей России.", "Париж — столица Франции.", "Берлин — столица Германии.", ] cfg = TrainConfig(model_type="w2v", vector_size=100, window=5, epochs=5, sg=1) model, tt = train_model(texts, cfg) save_model(model, "models/sample_w2v.model") print(evaluate_neighbors(model, ["россии", "франции"]))