# NLP_Homework_1 / src / embeddings_train.py
# Author: Kolesnikov Dmitry
# Commit 41c2e74 — fix: Данные для классификации
"""
Обучение распределённых представлений: Word2Vec (CBOW/Skip-gram), FastText (cbow/skipgram), Doc2Vec (PV-DM/PV-DBOW).
Предоставляет единый интерфейс обучения, сохранения, загрузки и базовых оценок.
"""
from __future__ import annotations
import os
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List, Optional, Tuple, Dict, Any
import numpy as np
import pandas as pd
from gensim.models import Word2Vec, FastText, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess
# Optional GloVe backend: probe the two known package names and record
# availability in GLOVE_AVAILABLE so the rest of the module can branch on it.
try:
    from glove import Glove, Corpus
except ImportError:
    try:
        from glove_python import Glove, Corpus
    except ImportError:
        GLOVE_AVAILABLE = False
        print("⚠️ GloVe не установлен. Установите: pip install glove-python-binary")
    else:
        GLOVE_AVAILABLE = True
else:
    GLOVE_AVAILABLE = True
@dataclass
class TrainConfig:
    """Hyperparameter bundle shared by every trainer in this module.

    `sg` applies to Word2Vec/FastText, `dm` to Doc2Vec; `alpha` and
    `x_max` are GloVe-specific settings.
    """
    model_type: str  # w2v | fasttext | doc2vec | glove
    vector_size: int = 300   # embedding dimensionality
    window: int = 8          # context window size
    min_count: int = 2       # ignore tokens rarer than this
    sg: int = 1  # 0=CBOW, 1=Skip-gram for w2v/fasttext; ignored by doc2vec
    dm: int = 1  # 1=PV-DM, 0=PV-DBOW for doc2vec
    epochs: int = 10         # training epochs
    workers: int = 4         # worker threads passed to the trainer
    negative: int = 5        # negative-sampling count (gensim `negative`)
    hs: int = 0              # hierarchical-softmax flag (gensim `hs`)
    seed: int = 42           # RNG seed for reproducibility
    # GloVe-specific parameters
    alpha: float = 0.75  # GloVe weighting-function exponent
    x_max: int = 100     # GloVe co-occurrence weighting cutoff
def _tokenize_corpus(texts: Iterable[str]) -> List[List[str]]:
    """Tokenize every text with gensim's simple_preprocess.

    Accents are kept (deacc=False) and single-character tokens are
    retained (min_len=1).
    """
    return [simple_preprocess(text, deacc=False, min_len=1) for text in texts]
def train_word2vec(texts: Iterable[str], cfg: TrainConfig) -> Word2Vec:
    """Train a Word2Vec model (CBOW or Skip-gram, per ``cfg.sg``).

    If the vocabulary is empty under ``cfg.min_count``, retries once with
    min_count=1; if still empty, returns the untrained model so the caller
    (UI) can report that no neighbors exist.
    """
    sentences = _tokenize_corpus(texts)
    params = {
        "vector_size": cfg.vector_size,
        "window": cfg.window,
        "min_count": cfg.min_count,
        "sg": cfg.sg,
        "workers": cfg.workers,
        "negative": cfg.negative,
        "hs": cfg.hs,
        "seed": cfg.seed,
    }
    model = Word2Vec(**params)
    model.build_vocab(sentences)
    # Vocabulary empty because of min_count — lower the threshold and rebuild.
    if len(model.wv) == 0 and cfg.min_count > 1:
        model.min_count = 1
        model.build_vocab(sentences, update=False)
    if len(model.wv) == 0:
        return model  # untrained empty model; UI will show no neighbors
    model.train(sentences, total_examples=len(sentences), epochs=cfg.epochs)
    return model
def train_fasttext(texts: Iterable[str], cfg: TrainConfig) -> FastText:
    """Train a FastText model (CBOW or Skip-gram, per ``cfg.sg``).

    Mirrors train_word2vec: retries once with min_count=1 when the
    vocabulary comes out empty, and returns the untrained model if it is
    still empty after the retry.
    """
    corpus = _tokenize_corpus(texts)
    model = FastText(
        vector_size=cfg.vector_size,
        window=cfg.window,
        min_count=cfg.min_count,
        sg=cfg.sg,
        workers=cfg.workers,
        negative=cfg.negative,
        hs=cfg.hs,
        seed=cfg.seed,
    )
    model.build_vocab(corpus)
    # Empty vocab under the configured min_count — rebuild with threshold 1.
    if not len(model.wv) and cfg.min_count > 1:
        model.min_count = 1
        model.build_vocab(corpus, update=False)
    if not len(model.wv):
        return model  # still empty: hand back the untrained model
    model.train(corpus, total_examples=len(corpus), epochs=cfg.epochs)
    return model
def train_doc2vec(texts: Iterable[str], cfg: TrainConfig) -> Doc2Vec:
    """Train a Doc2Vec model (PV-DM or PV-DBOW, per ``cfg.dm``).

    Each input text becomes a TaggedDocument tagged with its index.
    """
    # NOTE(review): this uses simple_preprocess defaults (min_len=2), while
    # the other trainers go through _tokenize_corpus with min_len=1 — confirm
    # the tokenization difference is intentional.
    tagged_docs = [
        TaggedDocument(simple_preprocess(text), [doc_id])
        for doc_id, text in enumerate(texts)
    ]
    model = Doc2Vec(
        vector_size=cfg.vector_size,
        window=cfg.window,
        min_count=cfg.min_count,
        dm=cfg.dm,
        workers=cfg.workers,
        negative=cfg.negative,
        hs=cfg.hs,
        seed=cfg.seed,
    )
    model.build_vocab(tagged_docs)
    # Same empty-vocabulary fallback as the word-level trainers.
    if len(model.wv) == 0 and cfg.min_count > 1:
        model.min_count = 1
        model.build_vocab(tagged_docs, update=False)
    if len(model.wv) == 0:
        return model
    model.train(tagged_docs, total_examples=len(tagged_docs), epochs=cfg.epochs)
    return model
def train_glove(texts: Iterable[str], cfg: TrainConfig):
    """Train a GloVe model on the tokenized corpus.

    Builds a co-occurrence matrix with ``cfg.window`` and fits GloVe for
    ``cfg.epochs`` epochs using ``cfg.workers`` threads.

    Raises:
        ImportError: if no GloVe package is installed.
    """
    if not GLOVE_AVAILABLE:
        raise ImportError(
            "GloVe не установлен. Установите: pip install glove-python-binary\n"
            "Или используйте альтернативу: pip install glove-python"
        )
    sentences = _tokenize_corpus(texts)
    # Build the co-occurrence matrix for GloVe.
    corpus = Corpus()
    corpus.fit(sentences, window=cfg.window)
    # Fix: cfg.alpha, cfg.x_max and cfg.seed are declared in TrainConfig as
    # GloVe parameters but were previously never forwarded — pass them through
    # (glove-python's Glove(no_components, learning_rate, alpha, max_count,
    # random_state)). Defaults match the old hard-coded behavior.
    model = Glove(
        no_components=cfg.vector_size,
        learning_rate=0.05,
        alpha=cfg.alpha,
        max_count=cfg.x_max,
        random_state=cfg.seed,
    )
    model.fit(corpus.matrix, epochs=cfg.epochs, no_threads=cfg.workers, verbose=True)
    # Attach the word->index dictionary so lookups by word are possible.
    model.add_dictionary(corpus.dictionary)
    return model
def train_model(texts: Iterable[str], cfg: TrainConfig):
    """Train the embedding model selected by ``cfg.model_type``.

    Returns:
        A tuple ``(model, train_time_seconds)``.

    Raises:
        ValueError: if ``cfg.model_type`` is not one of
            'w2v', 'fasttext', 'doc2vec', 'glove'.
    """
    kind = cfg.model_type
    if kind not in ("w2v", "fasttext", "doc2vec", "glove"):
        raise ValueError("model_type должен быть 'w2v', 'fasttext', 'doc2vec' или 'glove'")
    started = time.time()
    if kind == "w2v":
        model = train_word2vec(texts, cfg)
    elif kind == "fasttext":
        model = train_fasttext(texts, cfg)
    elif kind == "doc2vec":
        model = train_doc2vec(texts, cfg)
    else:
        model = train_glove(texts, cfg)
    return model, time.time() - started
def save_model(model, out_path: str) -> None:
    """Persist a trained model to ``out_path``, creating parent directories.

    Both gensim models and GloVe expose ``.save(path)``, so a single call
    suffices — the previous if/else executed the identical statement in
    both branches and has been removed.
    """
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    model.save(out_path)
def load_model(path: str):
    """Load a saved embedding model, probing each supported class in turn.

    Tries gensim Word2Vec, FastText and Doc2Vec loaders first, then GloVe
    when available.

    Raises:
        ValueError: if no loader can read the file at ``path``.
    """
    from gensim.models import Word2Vec as _W2V, FastText as _FT, Doc2Vec as _D2V
    for candidate in (_W2V, _FT, _D2V):
        try:
            return candidate.load(path)
        except Exception:
            continue
    # Fall back to GloVe if the package is installed.
    if GLOVE_AVAILABLE:
        try:
            from glove import Glove
            return Glove.load(path)
        except Exception:
            pass
    raise ValueError(f"Не удалось загрузить модель из {path}")
def evaluate_neighbors(model, test_words: List[str], topn: int = 10) -> Dict[str, List[Tuple[str, float]]]:
    """Return the top-N most similar words for each word in ``test_words``.

    Words missing from the model's vocabulary map to an empty list.

    Changes vs. previous version: detection of GloVe models is now purely
    duck-typed (``word_vectors`` + ``dictionary`` attributes), so it no longer
    depends on the module-level import flag; bare ``except:`` was narrowed to
    ``except Exception``; the query vector's norm is hoisted out of the loop.
    """
    results: Dict[str, List[Tuple[str, float]]] = {}
    if hasattr(model, "word_vectors") and hasattr(model, "dictionary"):
        # GloVe-like model: brute-force cosine similarity over the vocabulary.
        for w in test_words:
            try:
                if w not in model.dictionary:
                    results[w] = []
                    continue
                vec_w = model.word_vectors[model.dictionary[w]]
                norm_w = np.linalg.norm(vec_w)
                similarities = []
                for word, idx in model.dictionary.items():
                    if word == w:
                        continue
                    vec = model.word_vectors[idx]
                    sim = float(np.dot(vec_w, vec) / (norm_w * np.linalg.norm(vec)))
                    similarities.append((word, sim))
                similarities.sort(key=lambda x: x[1], reverse=True)
                results[w] = similarities[:topn]
            except Exception:  # was a bare except; keep the empty-result fallback
                results[w] = []
    else:
        # Gensim models (Word2Vec, FastText, Doc2Vec) expose KeyedVectors.
        kv = model.wv if hasattr(model, "wv") else model
        for w in test_words:
            results[w] = kv.most_similar(w, topn=topn) if w in kv else []
    return results
def cosine_similarity(model, word_pairs: List[Tuple[str, str]]) -> List[Tuple[str, str, float]]:
    """Compute cosine similarity for each (a, b) pair.

    Pairs with an out-of-vocabulary word yield ``np.nan`` as the score.

    Changes vs. previous version: GloVe detection is duck-typed on the
    ``word_vectors``/``dictionary`` attributes (no dependence on the import
    flag), and the bare ``except:`` was narrowed to ``except Exception``.
    """
    out: List[Tuple[str, str, float]] = []
    if hasattr(model, "word_vectors") and hasattr(model, "dictionary"):
        # GloVe-like model: manual cosine between the two word vectors.
        for a, b in word_pairs:
            try:
                if a in model.dictionary and b in model.dictionary:
                    vec_a = model.word_vectors[model.dictionary[a]]
                    vec_b = model.word_vectors[model.dictionary[b]]
                    sim = float(np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)))
                    out.append((a, b, sim))
                else:
                    out.append((a, b, np.nan))
            except Exception:  # was a bare except; keep the nan fallback
                out.append((a, b, np.nan))
    else:
        # Gensim models: delegate to KeyedVectors.similarity.
        kv = model.wv if hasattr(model, "wv") else model
        for a, b in word_pairs:
            if a in kv and b in kv:
                out.append((a, b, float(kv.similarity(a, b))))
            else:
                out.append((a, b, np.nan))
    return out
def word_analogy(model, a: str, b: str, c: str, topn: int = 10) -> List[Tuple[str, float]]:
    """Solve the analogy a : b :: c : ? and return the top-N candidates.

    Returns an empty list when any of a, b, c is out of vocabulary (or when
    the manual GloVe computation fails).

    Changes vs. previous version: GloVe detection is duck-typed on the
    ``word_vectors``/``dictionary`` attributes, and the bare ``except:`` was
    narrowed to ``except Exception``.
    """
    if hasattr(model, "word_vectors") and hasattr(model, "dictionary"):
        # GloVe-like model: compute target = b - a + c and rank by cosine.
        try:
            if all(token in model.dictionary for token in (a, b, c)):
                vec_a = model.word_vectors[model.dictionary[a]]
                vec_b = model.word_vectors[model.dictionary[b]]
                vec_c = model.word_vectors[model.dictionary[c]]
                target = vec_b - vec_a + vec_c
                norm_t = np.linalg.norm(target)
                similarities = []
                for word, idx in model.dictionary.items():
                    if word in (a, b, c):
                        continue  # exclude the query words themselves
                    vec = model.word_vectors[idx]
                    sim = float(np.dot(target, vec) / (norm_t * np.linalg.norm(vec)))
                    similarities.append((word, sim))
                similarities.sort(key=lambda x: x[1], reverse=True)
                return similarities[:topn]
        except Exception:  # was a bare except; fall through to the empty result
            pass
        return []
    # Gensim models: delegate to KeyedVectors.most_similar.
    kv = model.wv if hasattr(model, "wv") else model
    if all(token in kv for token in (a, b, c)):
        return kv.most_similar(positive=[b, c], negative=[a], topn=topn)
    return []
def export_training_report(cfg: TrainConfig, train_time: float, model_path: str, extra: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
    """Build a one-row DataFrame summarizing a training run.

    ``extra`` entries, when given, are merged into the row after the
    standard fields.
    """
    if cfg.model_type in {"w2v", "fasttext"}:
        architecture = "skipgram" if cfg.sg == 1 else "cbow"
    else:
        architecture = "pv-dm" if cfg.dm == 1 else "pv-dbow"
    row = {
        "Модель": cfg.model_type,
        "Размерность": cfg.vector_size,
        "Окно": cfg.window,
        "Min count": cfg.min_count,
        "Архитектура": architecture,
        "Эпохи": cfg.epochs,
        "Время обучения (с)": round(train_time, 2),
        "Путь": model_path,
    }
    if extra:
        row.update(extra)
    return pd.DataFrame([row])
if __name__ == "__main__":
    # Smoke test: train a tiny Skip-gram Word2Vec model on three sentences,
    # save it, and print nearest neighbors for two query words.
    sample_texts = [
        "Москва является столицей России.",
        "Париж — столица Франции.",
        "Берлин — столица Германии.",
    ]
    config = TrainConfig(model_type="w2v", vector_size=100, window=5, epochs=5, sg=1)
    trained, elapsed = train_model(sample_texts, config)
    save_model(trained, "models/sample_w2v.model")
    print(evaluate_neighbors(trained, ["россии", "франции"]))