Spaces:
Sleeping
Sleeping
| """ | |
| Семантические эксперименты с эмбеддингами: косинусное сходство, аналогии, семантические оси, | |
| качественный анализ ближайших соседей и построение матриц близости. | |
| """ | |
| from __future__ import annotations | |
| from typing import Dict, List, Tuple | |
| import numpy as np | |
| import pandas as pd | |
def cosine(u: np.ndarray, v: np.ndarray) -> float:
    """Return the cosine similarity of two vectors.

    Args:
        u: First vector.
        v: Second vector.

    Returns:
        The dot product of ``u`` and ``v`` divided by the product of their
        norms, as a Python float. NaN is returned when either vector has
        zero norm, because the angle is undefined in that case.
    """
    norm_u, norm_v = np.linalg.norm(u), np.linalg.norm(v)
    if norm_u != 0 and norm_v != 0:
        return float(np.dot(u, v) / (norm_u * norm_v))
    # At least one vector has zero length: similarity is undefined.
    return float("nan")
def pairwise_cosine_matrix(vectors: Dict[str, np.ndarray]) -> pd.DataFrame:
    """Build the square table of cosine similarities between named vectors.

    Args:
        vectors: Mapping from a label to its 1-D vector. All vectors must
            have the same length, otherwise ``np.dot`` raises.

    Returns:
        A DataFrame whose index and columns are the labels in the mapping's
        iteration order; cell ``(i, j)`` holds the cosine similarity of
        vectors ``i`` and ``j``. Cells involving a zero-norm vector are NaN.
        An empty mapping yields an empty 0x0 DataFrame.
    """
    keys = list(vectors)
    size = len(keys)
    # Each norm is needed for a whole row and column, so compute it once.
    norms = [np.linalg.norm(vectors[k]) for k in keys]
    mat = np.zeros((size, size), dtype=float)
    for i in range(size):
        # Cosine similarity is symmetric: fill the upper triangle and mirror.
        for j in range(i, size):
            if norms[i] == 0 or norms[j] == 0:
                sim = float("nan")
            else:
                dot = np.dot(vectors[keys[i]], vectors[keys[j]])
                sim = float(dot / (norms[i] * norms[j]))
            mat[i, j] = sim
            mat[j, i] = sim
    return pd.DataFrame(mat, index=keys, columns=keys)
def vector_arithmetic(model, expression: str, topn: int = 10) -> List[Tuple[str, float]]:
    """Evaluate a word-vector expression and return the nearest words.

    The expression is a sequence of words joined by ``+`` and ``-``, for
    example ``"king - man + woman"``. Words after ``+`` (or at the start)
    are passed to ``most_similar`` as positives, words after ``-`` as
    negatives.

    A whitespace-delimited chunk that is itself a vocabulary entry is kept
    whole, so hyphenated words such as ``"new-york"`` can be used as
    operands. Any other chunk is split on ``+`` and ``-``, which keeps
    unspaced expressions like ``"king-man+woman"`` working.

    Args:
        model: Object exposing ``most_similar(positive=, negative=, topn=)``
            either directly or through a ``wv`` attribute.
        expression: The arithmetic expression to evaluate.
        topn: Number of neighbours requested from ``most_similar``.

    Returns:
        Whatever ``most_similar`` returns (annotated as a list of
        ``(word, score)`` pairs). An empty list is returned when the
        expression has no positive term or when ``most_similar`` raises
        ``KeyError``.
    """
    kv = model.wv if hasattr(model, "wv") else model
    # Prefer an exact-match vocabulary mapping when the object exposes one.
    # NOTE(review): gensim 4 KeyedVectors provide `key_to_index`; some
    # subword models answer `in` for unseen words too - confirm for the
    # model types used here.
    vocab = getattr(kv, "key_to_index", kv)
    operators = ("+", "-")

    positives: List[str] = []
    negatives: List[str] = []
    sign = 1
    for chunk in expression.split():
        if chunk not in operators and chunk in vocab:
            # Known word (possibly hyphenated): do not break it apart.
            pieces = [chunk]
        else:
            pieces = chunk.replace("+", " + ").replace("-", " - ").split()
        for tok in pieces:
            if tok == "+":
                sign = 1
            elif tok == "-":
                sign = -1
            elif sign == 1:
                positives.append(tok)
            else:
                negatives.append(tok)

    if not positives:
        return []
    try:
        return kv.most_similar(positive=positives, negative=negatives, topn=topn)
    except KeyError:
        # Best-effort lookup: treat an unknown operand as "no result".
        return []
def semantic_axis(model, a: str, b: str, words: List[str]) -> pd.DataFrame:
    """Project words onto the semantic axis running from ``a`` to ``b``.

    The axis is the difference ``kv[b] - kv[a]`` scaled to (approximately)
    unit length; each word's coordinate is the dot product of its vector
    with that axis.

    Args:
        model: Object supporting ``in`` and ``[]`` lookup by word, either
            directly or through a ``wv`` attribute.
        a: Word at the start of the axis.
        b: Word at the end of the axis.
        words: Words to project.

    Returns:
        A DataFrame with columns ``"слово"`` (word) and ``"проекция"``
        (projection), one row per entry of ``words`` in order. Words that
        are not in the model get NaN. The frame is empty, but still has
        both columns, when ``a`` or ``b`` is missing or ``words`` is empty.
    """
    columns = ["слово", "проекция"]
    kv = model.wv if hasattr(model, "wv") else model
    if a not in kv or b not in kv:
        return pd.DataFrame(columns=columns)

    axis = kv[b] - kv[a]
    # The small epsilon avoids division by zero when a and b coincide.
    axis_norm = axis / (np.linalg.norm(axis) + 1e-9)

    rows = [
        {
            "слово": w,
            "проекция": float(np.dot(kv[w], axis_norm)) if w in kv else np.nan,
        }
        for w in words
    ]
    # Passing columns explicitly keeps the schema stable for empty `words`.
    return pd.DataFrame(rows, columns=columns)
def nearest_neighbors(model, words: List[str], topn: int = 10) -> Dict[str, List[Tuple[str, float]]]:
    """Look up the nearest neighbours of each word.

    Args:
        model: Object supporting ``in`` and ``most_similar(word, topn=)``,
            either directly or through a ``wv`` attribute.
        words: Words to query.
        topn: Number of neighbours requested per word.

    Returns:
        A dict mapping every queried word to the result of
        ``most_similar`` for it, or to an empty list when the word is not
        in the model.
    """
    vectors = getattr(model, "wv", model)
    return {
        word: (vectors.most_similar(word, topn=topn) if word in vectors else [])
        for word in words
    }