# NLP_Homework_1 / src / embeddings_train.py
# Author: Kolesnikov Dmitry
# Commit 41c2e74 — fix: Данные для классификации
"""
Обучение распределённых представлений: Word2Vec (CBOW/Skip-gram), FastText (cbow/skipgram), Doc2Vec (PV-DM/PV-DBOW).
Предоставляет единый интерфейс обучения, сохранения, загрузки и базовых оценок.
"""
from __future__ import annotations
import os
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List, Optional, Tuple, Dict, Any
import numpy as np
import pandas as pd
from gensim.models import Word2Vec, FastText, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess
# Optional GloVe backend: probe the two known package names and record
# availability in GLOVE_AVAILABLE so the rest of the module can branch on it.
try:
    from glove import Glove, Corpus
except ImportError:
    try:
        from glove_python import Glove, Corpus
    except ImportError:
        GLOVE_AVAILABLE = False
        print("⚠️ GloVe не установлен. Установите: pip install glove-python-binary")
    else:
        GLOVE_AVAILABLE = True
else:
    GLOVE_AVAILABLE = True
@dataclass
class TrainConfig:
    """Hyperparameter bundle shared by every trainer in this module.

    `sg` applies to Word2Vec/FastText, `dm` to Doc2Vec; `alpha` and
    `x_max` are GloVe-specific settings.
    """
    model_type: str  # w2v | fasttext | doc2vec | glove
    vector_size: int = 300   # embedding dimensionality
    window: int = 8          # context window size
    min_count: int = 2       # ignore tokens rarer than this
    sg: int = 1  # 0=CBOW, 1=Skip-gram for w2v/fasttext; ignored by doc2vec
    dm: int = 1  # 1=PV-DM, 0=PV-DBOW for doc2vec
    epochs: int = 10         # training epochs
    workers: int = 4         # worker threads passed to the trainer
    negative: int = 5        # negative-sampling count (gensim `negative`)
    hs: int = 0              # hierarchical-softmax flag (gensim `hs`)
    seed: int = 42           # RNG seed for reproducibility
    # GloVe-specific parameters
    alpha: float = 0.75  # GloVe weighting-function exponent
    x_max: int = 100     # GloVe co-occurrence weighting cutoff
def _tokenize_corpus(texts: Iterable[str]) -> List[List[str]]:
    """Tokenize every text with gensim's simple_preprocess.

    Accents are kept (deacc=False) and single-character tokens are
    retained (min_len=1).
    """
    return [simple_preprocess(text, deacc=False, min_len=1) for text in texts]
def train_word2vec(texts: Iterable[str], cfg: TrainConfig) -> Word2Vec:
    """Train a Word2Vec model (CBOW or Skip-gram, per ``cfg.sg``).

    If the vocabulary is empty under ``cfg.min_count``, retries once with
    min_count=1; if still empty, returns the untrained model so the caller
    (UI) can report that no neighbors exist.
    """
    sentences = _tokenize_corpus(texts)
    params = {
        "vector_size": cfg.vector_size,
        "window": cfg.window,
        "min_count": cfg.min_count,
        "sg": cfg.sg,
        "workers": cfg.workers,
        "negative": cfg.negative,
        "hs": cfg.hs,
        "seed": cfg.seed,
    }
    model = Word2Vec(**params)
    model.build_vocab(sentences)
    # Vocabulary empty because of min_count — lower the threshold and rebuild.
    if len(model.wv) == 0 and cfg.min_count > 1:
        model.min_count = 1
        model.build_vocab(sentences, update=False)
    if len(model.wv) == 0:
        return model  # untrained empty model; UI will show no neighbors
    model.train(sentences, total_examples=len(sentences), epochs=cfg.epochs)
    return model
def train_fasttext(texts: Iterable[str], cfg: TrainConfig) -> FastText:
    """Train a FastText model (CBOW or Skip-gram, per ``cfg.sg``).

    Mirrors train_word2vec: retries once with min_count=1 when the
    vocabulary comes out empty, and returns the untrained model if it is
    still empty after the retry.
    """
    corpus = _tokenize_corpus(texts)
    model = FastText(
        vector_size=cfg.vector_size,
        window=cfg.window,
        min_count=cfg.min_count,
        sg=cfg.sg,
        workers=cfg.workers,
        negative=cfg.negative,
        hs=cfg.hs,
        seed=cfg.seed,
    )
    model.build_vocab(corpus)
    # Empty vocab under the configured min_count — rebuild with threshold 1.
    if not len(model.wv) and cfg.min_count > 1:
        model.min_count = 1
        model.build_vocab(corpus, update=False)
    if not len(model.wv):
        return model  # still empty: hand back the untrained model
    model.train(corpus, total_examples=len(corpus), epochs=cfg.epochs)
    return model
def train_doc2vec(texts: Iterable[str], cfg: TrainConfig) -> Doc2Vec:
    """Train a Doc2Vec model (PV-DM or PV-DBOW, per ``cfg.dm``).

    Each input text becomes a TaggedDocument tagged with its index.
    """
    # NOTE(review): this uses simple_preprocess defaults (min_len=2), while
    # the other trainers go through _tokenize_corpus with min_len=1 — confirm
    # the tokenization difference is intentional.
    tagged_docs = [
        TaggedDocument(simple_preprocess(text), [doc_id])
        for doc_id, text in enumerate(texts)
    ]
    model = Doc2Vec(
        vector_size=cfg.vector_size,
        window=cfg.window,
        min_count=cfg.min_count,
        dm=cfg.dm,
        workers=cfg.workers,
        negative=cfg.negative,
        hs=cfg.hs,
        seed=cfg.seed,
    )
    model.build_vocab(tagged_docs)
    # Same empty-vocabulary fallback as the word-level trainers.
    if len(model.wv) == 0 and cfg.min_count > 1:
        model.min_count = 1
        model.build_vocab(tagged_docs, update=False)
    if len(model.wv) == 0:
        return model
    model.train(tagged_docs, total_examples=len(tagged_docs), epochs=cfg.epochs)
    return model
def train_glove(texts: Iterable[str], cfg: TrainConfig):
    """Train a GloVe model on the tokenized corpus.

    Builds a co-occurrence matrix with ``cfg.window`` and fits GloVe for
    ``cfg.epochs`` epochs using ``cfg.workers`` threads.

    Raises:
        ImportError: if no GloVe package is installed.
    """
    if not GLOVE_AVAILABLE:
        raise ImportError(
            "GloVe не установлен. Установите: pip install glove-python-binary\n"
            "Или используйте альтернативу: pip install glove-python"
        )
    sentences = _tokenize_corpus(texts)
    # Build the co-occurrence matrix for GloVe.
    corpus = Corpus()
    corpus.fit(sentences, window=cfg.window)
    # Fix: cfg.alpha, cfg.x_max and cfg.seed are declared in TrainConfig as
    # GloVe parameters but were previously never forwarded — pass them through
    # (glove-python's Glove(no_components, learning_rate, alpha, max_count,
    # random_state)). Defaults match the old hard-coded behavior.
    model = Glove(
        no_components=cfg.vector_size,
        learning_rate=0.05,
        alpha=cfg.alpha,
        max_count=cfg.x_max,
        random_state=cfg.seed,
    )
    model.fit(corpus.matrix, epochs=cfg.epochs, no_threads=cfg.workers, verbose=True)
    # Attach the word->index dictionary so lookups by word are possible.
    model.add_dictionary(corpus.dictionary)
    return model
def train_model(texts: Iterable[str], cfg: TrainConfig):
    """Train the embedding model selected by ``cfg.model_type``.

    Returns:
        A tuple ``(model, train_time_seconds)``.

    Raises:
        ValueError: if ``cfg.model_type`` is not one of
            'w2v', 'fasttext', 'doc2vec', 'glove'.
    """
    kind = cfg.model_type
    if kind not in ("w2v", "fasttext", "doc2vec", "glove"):
        raise ValueError("model_type должен быть 'w2v', 'fasttext', 'doc2vec' или 'glove'")
    started = time.time()
    if kind == "w2v":
        model = train_word2vec(texts, cfg)
    elif kind == "fasttext":
        model = train_fasttext(texts, cfg)
    elif kind == "doc2vec":
        model = train_doc2vec(texts, cfg)
    else:
        model = train_glove(texts, cfg)
    return model, time.time() - started
def save_model(model, out_path: str) -> None:
    """Persist a trained model to ``out_path``, creating parent directories.

    Both gensim models and GloVe expose ``.save(path)``, so a single call
    suffices — the previous if/else executed the identical statement in
    both branches and has been removed.
    """
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    model.save(out_path)
def load_model(path: str):
    """Load a saved embedding model, probing each supported class in turn.

    Tries gensim Word2Vec, FastText and Doc2Vec loaders first, then GloVe
    when available.

    Raises:
        ValueError: if no loader can read the file at ``path``.
    """
    from gensim.models import Word2Vec as _W2V, FastText as _FT, Doc2Vec as _D2V
    for candidate in (_W2V, _FT, _D2V):
        try:
            return candidate.load(path)
        except Exception:
            continue
    # Fall back to GloVe if the package is installed.
    if GLOVE_AVAILABLE:
        try:
            from glove import Glove
            return Glove.load(path)
        except Exception:
            pass
    raise ValueError(f"Не удалось загрузить модель из {path}")
def evaluate_neighbors(model, test_words: List[str], topn: int = 10) -> Dict[str, List[Tuple[str, float]]]:
    """Return the top-N most similar words for each word in ``test_words``.

    Words missing from the model's vocabulary map to an empty list.

    Changes vs. previous version: detection of GloVe models is now purely
    duck-typed (``word_vectors`` + ``dictionary`` attributes), so it no longer
    depends on the module-level import flag; bare ``except:`` was narrowed to
    ``except Exception``; the query vector's norm is hoisted out of the loop.
    """
    results: Dict[str, List[Tuple[str, float]]] = {}
    if hasattr(model, "word_vectors") and hasattr(model, "dictionary"):
        # GloVe-like model: brute-force cosine similarity over the vocabulary.
        for w in test_words:
            try:
                if w not in model.dictionary:
                    results[w] = []
                    continue
                vec_w = model.word_vectors[model.dictionary[w]]
                norm_w = np.linalg.norm(vec_w)
                similarities = []
                for word, idx in model.dictionary.items():
                    if word == w:
                        continue
                    vec = model.word_vectors[idx]
                    sim = float(np.dot(vec_w, vec) / (norm_w * np.linalg.norm(vec)))
                    similarities.append((word, sim))
                similarities.sort(key=lambda x: x[1], reverse=True)
                results[w] = similarities[:topn]
            except Exception:  # was a bare except; keep the empty-result fallback
                results[w] = []
    else:
        # Gensim models (Word2Vec, FastText, Doc2Vec) expose KeyedVectors.
        kv = model.wv if hasattr(model, "wv") else model
        for w in test_words:
            results[w] = kv.most_similar(w, topn=topn) if w in kv else []
    return results
def cosine_similarity(model, word_pairs: List[Tuple[str, str]]) -> List[Tuple[str, str, float]]:
    """Compute cosine similarity for each (a, b) pair.

    Pairs with an out-of-vocabulary word yield ``np.nan`` as the score.

    Changes vs. previous version: GloVe detection is duck-typed on the
    ``word_vectors``/``dictionary`` attributes (no dependence on the import
    flag), and the bare ``except:`` was narrowed to ``except Exception``.
    """
    out: List[Tuple[str, str, float]] = []
    if hasattr(model, "word_vectors") and hasattr(model, "dictionary"):
        # GloVe-like model: manual cosine between the two word vectors.
        for a, b in word_pairs:
            try:
                if a in model.dictionary and b in model.dictionary:
                    vec_a = model.word_vectors[model.dictionary[a]]
                    vec_b = model.word_vectors[model.dictionary[b]]
                    sim = float(np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)))
                    out.append((a, b, sim))
                else:
                    out.append((a, b, np.nan))
            except Exception:  # was a bare except; keep the nan fallback
                out.append((a, b, np.nan))
    else:
        # Gensim models: delegate to KeyedVectors.similarity.
        kv = model.wv if hasattr(model, "wv") else model
        for a, b in word_pairs:
            if a in kv and b in kv:
                out.append((a, b, float(kv.similarity(a, b))))
            else:
                out.append((a, b, np.nan))
    return out
def word_analogy(model, a: str, b: str, c: str, topn: int = 10) -> List[Tuple[str, float]]:
    """Solve the analogy a : b :: c : ? and return the top-N candidates.

    Returns an empty list when any of a, b, c is out of vocabulary (or when
    the manual GloVe computation fails).

    Changes vs. previous version: GloVe detection is duck-typed on the
    ``word_vectors``/``dictionary`` attributes, and the bare ``except:`` was
    narrowed to ``except Exception``.
    """
    if hasattr(model, "word_vectors") and hasattr(model, "dictionary"):
        # GloVe-like model: compute target = b - a + c and rank by cosine.
        try:
            if all(token in model.dictionary for token in (a, b, c)):
                vec_a = model.word_vectors[model.dictionary[a]]
                vec_b = model.word_vectors[model.dictionary[b]]
                vec_c = model.word_vectors[model.dictionary[c]]
                target = vec_b - vec_a + vec_c
                norm_t = np.linalg.norm(target)
                similarities = []
                for word, idx in model.dictionary.items():
                    if word in (a, b, c):
                        continue  # exclude the query words themselves
                    vec = model.word_vectors[idx]
                    sim = float(np.dot(target, vec) / (norm_t * np.linalg.norm(vec)))
                    similarities.append((word, sim))
                similarities.sort(key=lambda x: x[1], reverse=True)
                return similarities[:topn]
        except Exception:  # was a bare except; fall through to the empty result
            pass
        return []
    # Gensim models: delegate to KeyedVectors.most_similar.
    kv = model.wv if hasattr(model, "wv") else model
    if all(token in kv for token in (a, b, c)):
        return kv.most_similar(positive=[b, c], negative=[a], topn=topn)
    return []
def export_training_report(cfg: TrainConfig, train_time: float, model_path: str, extra: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
    """Build a one-row DataFrame summarizing a training run.

    ``extra`` entries, when given, are merged into the row after the
    standard fields.
    """
    if cfg.model_type in {"w2v", "fasttext"}:
        architecture = "skipgram" if cfg.sg == 1 else "cbow"
    else:
        architecture = "pv-dm" if cfg.dm == 1 else "pv-dbow"
    row = {
        "Модель": cfg.model_type,
        "Размерность": cfg.vector_size,
        "Окно": cfg.window,
        "Min count": cfg.min_count,
        "Архитектура": architecture,
        "Эпохи": cfg.epochs,
        "Время обучения (с)": round(train_time, 2),
        "Путь": model_path,
    }
    if extra:
        row.update(extra)
    return pd.DataFrame([row])
if __name__ == "__main__":
    # Smoke test: train a tiny Skip-gram Word2Vec model on three sentences,
    # save it, and print nearest neighbors for two query words.
    sample_texts = [
        "Москва является столицей России.",
        "Париж — столица Франции.",
        "Берлин — столица Германии.",
    ]
    config = TrainConfig(model_type="w2v", vector_size=100, window=5, epochs=5, sg=1)
    trained, elapsed = train_model(sample_texts, config)
    save_model(trained, "models/sample_w2v.model")
    print(evaluate_neighbors(trained, ["россии", "франции"]))