Spaces:
Sleeping
Sleeping
"""
Training of distributed representations: Word2Vec (CBOW/Skip-gram),
FastText (cbow/skipgram), Doc2Vec (PV-DM/PV-DBOW) and GloVe.
Provides a unified interface for training, saving, loading and basic evaluation.
"""
| from __future__ import annotations | |
| import os | |
| import time | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
| from typing import Iterable, List, Optional, Tuple, Dict, Any | |
| import numpy as np | |
| import pandas as pd | |
| from gensim.models import Word2Vec, FastText, Doc2Vec | |
| from gensim.models.doc2vec import TaggedDocument | |
| from gensim.utils import simple_preprocess | |
| try: | |
| from glove import Glove, Corpus | |
| GLOVE_AVAILABLE = True | |
| except ImportError: | |
| try: | |
| from glove_python import Glove, Corpus | |
| GLOVE_AVAILABLE = True | |
| except ImportError: | |
| GLOVE_AVAILABLE = False | |
| print("⚠️ GloVe не установлен. Установите: pip install glove-python-binary") | |
@dataclass
class TrainConfig:
    """Configuration shared by all supported embedding trainers.

    Fix: the @dataclass decorator was missing — a bare annotated class
    generates no __init__, so TrainConfig(model_type=..., ...) (as used in
    the __main__ demo) would raise TypeError. `dataclass` is already
    imported at the top of this file.
    """
    model_type: str          # "w2v" | "fasttext" | "doc2vec" | "glove"
    vector_size: int = 300   # embedding dimensionality
    window: int = 8          # context window size
    min_count: int = 2       # discard tokens rarer than this
    sg: int = 1              # 0=CBOW, 1=Skip-gram for w2v/fasttext; ignored by doc2vec
    dm: int = 1              # 1=PV-DM, 0=PV-DBOW for doc2vec
    epochs: int = 10
    workers: int = 4
    negative: int = 5        # negative-sampling count
    hs: int = 0              # hierarchical softmax disabled by default
    seed: int = 42
    # GloVe-specific hyperparameters
    alpha: float = 0.75      # weighting-function exponent (GloVe)
    x_max: int = 100         # weighting-function cutoff (GloVe)
def _tokenize_corpus(texts: Iterable[str]) -> List[List[str]]:
    """Tokenize every text with gensim's simple_preprocess.

    min_len=1 keeps single-character tokens; deacc=False leaves accents intact.
    """
    tokenized: List[List[str]] = []
    for text in texts:
        tokenized.append(simple_preprocess(text, deacc=False, min_len=1))
    return tokenized
def train_word2vec(texts: Iterable[str], cfg: TrainConfig) -> Word2Vec:
    """Train a Word2Vec model (CBOW or Skip-gram per cfg.sg) on raw texts."""
    corpus = _tokenize_corpus(texts)
    params = dict(
        vector_size=cfg.vector_size,
        window=cfg.window,
        min_count=cfg.min_count,
        sg=cfg.sg,
        workers=cfg.workers,
        negative=cfg.negative,
        hs=cfg.hs,
        seed=cfg.seed,
    )
    model = Word2Vec(**params)
    model.build_vocab(corpus)
    # An over-aggressive min_count can empty the vocabulary; retry once at 1.
    if not len(model.wv) and cfg.min_count > 1:
        model.min_count = 1
        model.build_vocab(corpus, update=False)
    if not len(model.wv):
        # Still empty: return the untrained model; the UI shows "no neighbors".
        return model
    model.train(corpus, total_examples=len(corpus), epochs=cfg.epochs)
    return model
def train_fasttext(texts: Iterable[str], cfg: TrainConfig) -> FastText:
    """Train a FastText model (architecture selected by cfg.sg) on raw texts."""
    corpus = _tokenize_corpus(texts)
    params = dict(
        vector_size=cfg.vector_size,
        window=cfg.window,
        min_count=cfg.min_count,
        sg=cfg.sg,
        workers=cfg.workers,
        negative=cfg.negative,
        hs=cfg.hs,
        seed=cfg.seed,
    )
    model = FastText(**params)
    model.build_vocab(corpus)
    # An over-aggressive min_count can empty the vocabulary; retry once at 1.
    if not len(model.wv) and cfg.min_count > 1:
        model.min_count = 1
        model.build_vocab(corpus, update=False)
    if not len(model.wv):
        return model  # empty model; caller handles the no-vocabulary case
    model.train(corpus, total_examples=len(corpus), epochs=cfg.epochs)
    return model
def train_doc2vec(texts: Iterable[str], cfg: TrainConfig) -> Doc2Vec:
    """Train a Doc2Vec model (PV-DM when cfg.dm == 1, else PV-DBOW).

    Each document is tagged with its integer position in *texts*.
    """
    # NOTE(review): tokenization here uses simple_preprocess defaults
    # (min_len=2), unlike _tokenize_corpus(min_len=1) used by the other
    # trainers — confirm this asymmetry is intended.
    documents = [
        TaggedDocument(simple_preprocess(text), [position])
        for position, text in enumerate(texts)
    ]
    model = Doc2Vec(
        vector_size=cfg.vector_size,
        window=cfg.window,
        min_count=cfg.min_count,
        dm=cfg.dm,
        workers=cfg.workers,
        negative=cfg.negative,
        hs=cfg.hs,
        seed=cfg.seed,
    )
    model.build_vocab(documents)
    # An over-aggressive min_count can empty the vocabulary; retry once at 1.
    if not len(model.wv) and cfg.min_count > 1:
        model.min_count = 1
        model.build_vocab(documents, update=False)
    if not len(model.wv):
        return model
    model.train(documents, total_examples=len(documents), epochs=cfg.epochs)
    return model
def train_glove(texts: Iterable[str], cfg: TrainConfig):
    """Train a GloVe model via the glove-python bindings.

    Raises ImportError when neither glove binding is installed.
    """
    if not GLOVE_AVAILABLE:
        raise ImportError(
            "GloVe не установлен. Установите: pip install glove-python-binary\n"
            "Или используйте альтернативу: pip install glove-python"
        )
    tokenized = _tokenize_corpus(texts)
    # Build the co-occurrence statistics, then fit the factorization on them.
    cooccurrence = Corpus()
    cooccurrence.fit(tokenized, window=cfg.window)
    glove_model = Glove(no_components=cfg.vector_size, learning_rate=0.05)
    glove_model.fit(
        cooccurrence.matrix,
        epochs=cfg.epochs,
        no_threads=cfg.workers,
        verbose=True,
    )
    # Attach the word→index mapping so lookups by word are possible.
    glove_model.add_dictionary(cooccurrence.dictionary)
    return glove_model
def train_model(texts: Iterable[str], cfg: TrainConfig):
    """Dispatch to the trainer selected by cfg.model_type.

    Returns a (model, training_time_seconds) tuple.
    Raises ValueError for an unknown model_type.
    """
    trainers = {
        "w2v": train_word2vec,
        "fasttext": train_fasttext,
        "doc2vec": train_doc2vec,
        "glove": train_glove,
    }
    trainer = trainers.get(cfg.model_type)
    if trainer is None:
        raise ValueError("model_type должен быть 'w2v', 'fasttext', 'doc2vec' или 'glove'")
    started = time.time()
    model = trainer(texts, cfg)
    return model, time.time() - started
def save_model(model, out_path: str) -> None:
    """Persist a trained model to *out_path*, creating parent directories.

    Fix: the original branched on GloVe-vs-gensim but executed the identical
    `model.save(out_path)` call on both sides — every supported model type
    exposes the same .save(path) method, so the dead conditional was removed.
    Also uses pathlib's .parent instead of os.path.dirname.
    """
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    model.save(out_path)
def load_model(path: str):
    """Load a saved embedding model, probing each supported format in turn.

    Tries Word2Vec, FastText, Doc2Vec, then (when available) GloVe.
    Raises ValueError when every loader fails.
    """
    from gensim.models import Word2Vec as _W2V, FastText as _FT, Doc2Vec as _D2V

    for loader in (_W2V.load, _FT.load, _D2V.load):
        try:
            return loader(path)
        except Exception:
            continue
    if GLOVE_AVAILABLE:
        # Import inside the try: the flag may have been set by glove_python,
        # in which case `from glove import ...` itself can fail.
        try:
            from glove import Glove
            return Glove.load(path)
        except Exception:
            pass
    raise ValueError(f"Не удалось загрузить модель из {path}")
def evaluate_neighbors(model, test_words: List[str], topn: int = 10) -> Dict[str, List[Tuple[str, float]]]:
    """Return the topn nearest neighbors (word, cosine similarity) per test word.

    Words absent from the model's vocabulary map to an empty list.
    Handles both glove-python models (detected via their word_vectors /
    dictionary attributes — gensim models expose neither, so the extra
    GLOVE_AVAILABLE flag check was redundant and has been dropped) and
    gensim models (Word2Vec / FastText / Doc2Vec).

    Fixes: bare `except:` (which also swallowed KeyboardInterrupt/SystemExit)
    narrowed to `except Exception`; the query vector's norm is hoisted out of
    the vocabulary loop instead of being recomputed per word.
    """
    results: Dict[str, List[Tuple[str, float]]] = {}
    if hasattr(model, "word_vectors") and hasattr(model, "dictionary"):
        # glove-python has no most_similar(); rank by cosine similarity manually.
        for w in test_words:
            try:
                if w not in model.dictionary:
                    results[w] = []
                    continue
                vec_w = model.word_vectors[model.dictionary[w]]
                norm_w = np.linalg.norm(vec_w)  # invariant over the vocab loop
                sims: List[Tuple[str, float]] = []
                for word, idx in model.dictionary.items():
                    if word == w:
                        continue
                    vec = model.word_vectors[idx]
                    sims.append((word, float(np.dot(vec_w, vec) / (norm_w * np.linalg.norm(vec)))))
                sims.sort(key=lambda pair: pair[1], reverse=True)
                results[w] = sims[:topn]
            except Exception:
                results[w] = []
    else:
        # Gensim models (Word2Vec, FastText, Doc2Vec) or a bare KeyedVectors.
        kv = model.wv if hasattr(model, "wv") else model
        for w in test_words:
            results[w] = kv.most_similar(w, topn=topn) if w in kv else []
    return results
def cosine_similarity(model, word_pairs: List[Tuple[str, str]]) -> List[Tuple[str, str, float]]:
    """Return (a, b, cosine) per pair; NaN when either word is out-of-vocabulary.

    Glove-python models are detected via their word_vectors / dictionary
    attributes (gensim models expose neither, so the extra GLOVE_AVAILABLE
    check was redundant and has been dropped); everything else goes through
    the gensim KeyedVectors API.

    Fix: bare `except:` narrowed to `except Exception` so that
    KeyboardInterrupt / SystemExit are no longer swallowed.
    """
    out: List[Tuple[str, str, float]] = []
    if hasattr(model, "word_vectors") and hasattr(model, "dictionary"):
        for a, b in word_pairs:
            try:
                if a in model.dictionary and b in model.dictionary:
                    vec_a = model.word_vectors[model.dictionary[a]]
                    vec_b = model.word_vectors[model.dictionary[b]]
                    sim = float(np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)))
                    out.append((a, b, sim))
                else:
                    out.append((a, b, np.nan))
            except Exception:
                out.append((a, b, np.nan))
    else:
        # Gensim models (Word2Vec, FastText, Doc2Vec) or a bare KeyedVectors.
        kv = model.wv if hasattr(model, "wv") else model
        for a, b in word_pairs:
            if a in kv and b in kv:
                out.append((a, b, float(kv.similarity(a, b))))
            else:
                out.append((a, b, np.nan))
    return out
def word_analogy(model, a: str, b: str, c: str, topn: int = 10) -> List[Tuple[str, float]]:
    """Solve the analogy a : b :: c : ? and return the topn candidates.

    Returns [] when any of the three words is out-of-vocabulary. Glove-python
    models (detected via word_vectors / dictionary attributes; gensim models
    expose neither, so the extra GLOVE_AVAILABLE check was redundant) are
    scored manually against target = b - a + c; gensim models use most_similar.

    Fixes: bare `except:` narrowed to `except Exception`; the target vector's
    norm is hoisted out of the vocabulary loop; the excluded words are held
    in a set instead of a list rebuilt per membership test.
    """
    if hasattr(model, "word_vectors") and hasattr(model, "dictionary"):
        try:
            if not all(token in model.dictionary for token in (a, b, c)):
                return []
            vec_a = model.word_vectors[model.dictionary[a]]
            vec_b = model.word_vectors[model.dictionary[b]]
            vec_c = model.word_vectors[model.dictionary[c]]
            target = vec_b - vec_a + vec_c
            norm_t = np.linalg.norm(target)  # invariant over the vocab loop
            excluded = {a, b, c}
            sims: List[Tuple[str, float]] = []
            for word, idx in model.dictionary.items():
                if word in excluded:
                    continue
                vec = model.word_vectors[idx]
                sims.append((word, float(np.dot(target, vec) / (norm_t * np.linalg.norm(vec)))))
            sims.sort(key=lambda pair: pair[1], reverse=True)
            return sims[:topn]
        except Exception:
            return []
    # Gensim models (Word2Vec, FastText, Doc2Vec) or a bare KeyedVectors.
    kv = model.wv if hasattr(model, "wv") else model
    if all(token in kv for token in (a, b, c)):
        return kv.most_similar(positive=[b, c], negative=[a], topn=topn)
    return []
def export_training_report(cfg: TrainConfig, train_time: float, model_path: str, extra: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
    """Build a one-row DataFrame summarizing a training run.

    Fix: the architecture label previously had only two branches, so a GloVe
    run fell through to the doc2vec labels and was reported as "pv-dm" /
    "pv-dbow". Unknown/other model types now report the model type itself.

    Args:
        cfg: training configuration used for the run.
        train_time: wall-clock training time in seconds (rounded to 2 dp).
        model_path: where the model was saved.
        extra: optional additional columns merged into the row.
    """
    if cfg.model_type in {"w2v", "fasttext"}:
        architecture = "skipgram" if cfg.sg == 1 else "cbow"
    elif cfg.model_type == "doc2vec":
        architecture = "pv-dm" if cfg.dm == 1 else "pv-dbow"
    else:
        architecture = cfg.model_type  # e.g. "glove" — no sg/dm notion
    data = {
        "Модель": cfg.model_type,
        "Размерность": cfg.vector_size,
        "Окно": cfg.window,
        "Min count": cfg.min_count,
        "Архитектура": architecture,
        "Эпохи": cfg.epochs,
        "Время обучения (с)": round(train_time, 2),
        "Путь": model_path,
    }
    if extra:
        data.update(extra)
    return pd.DataFrame([data])
if __name__ == "__main__":
    # Smoke test: train a tiny Word2Vec model, save it, and print neighbors.
    demo_texts = [
        "Москва является столицей России.",
        "Париж — столица Франции.",
        "Берлин — столица Германии.",
    ]
    demo_cfg = TrainConfig(model_type="w2v", vector_size=100, window=5, epochs=5, sg=1)
    demo_model, _elapsed = train_model(demo_texts, demo_cfg)
    save_model(demo_model, "models/sample_w2v.model")
    print(evaluate_neighbors(demo_model, ["россии", "франции"]))