Spaces:
Sleeping
Sleeping
| """ | |
| Снижение размерности и тематическое моделирование для классических векторных представлений. | |
| Поддерживаются: TruncatedSVD (LSA), визуализация UMAP/t-SNE, анализ объясненной дисперсии | |
| и интерпретация компонент через топ-термины. | |
| """ | |
| from __future__ import annotations | |
| from dataclasses import dataclass | |
| from typing import List, Tuple, Dict, Any, Optional | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.decomposition import TruncatedSVD | |
| from sklearn.manifold import TSNE | |
| try: | |
| import umap # type: ignore | |
| UMAP_AVAILABLE = True | |
| except Exception: | |
| UMAP_AVAILABLE = False | |
@dataclass
class SVDConfig:
    """Settings for the TruncatedSVD (LSA) decomposition.

    Declared as a dataclass so values can be supplied at construction time,
    e.g. ``SVDConfig(n_components=50)``; the defaults below apply otherwise.
    Instances stay mutable, so assigning to an attribute after construction
    keeps working.
    """

    # Number of SVD components requested from TruncatedSVD.
    n_components: int = 100
    # Seed forwarded to TruncatedSVD as ``random_state``.
    random_state: int = 42
def run_lsa(X, feature_names: List[str], config: SVDConfig, top_n: int = 20) -> Dict[str, Any]:
    """Fit LSA (TruncatedSVD) on ``X`` and summarise the resulting components.

    Args:
        X: Feature matrix handed unchanged to ``TruncatedSVD.fit_transform``
            (presumably a documents-by-terms matrix such as TF-IDF; confirm
            against the callers).
        feature_names: Term for each column of ``X``; looked up by column
            position when building the top-term lists.
        config: Supplies ``n_components`` and ``random_state`` for the SVD.
        top_n: How many terms to keep per component, ranked by absolute
            weight. Defaults to 20, the value that used to be hard-coded.

    Returns:
        A dict with the keys:
            ``"svd"``: the fitted ``TruncatedSVD`` instance.
            ``"X_reduced"``: the output of ``fit_transform(X)``.
            ``"explained_variance_ratio"``: ``svd.explained_variance_ratio_``.
            ``"explained_variance_ratio_cum"``: its cumulative sum.
            ``"top_terms_per_component"``: one list per component holding
            ``(term, signed_weight)`` tuples, strongest absolute weight first.

    Raises:
        ValueError: If ``top_n`` is negative.

    NOTE(review): scikit-learn itself is expected to reject an
    ``n_components`` that exceeds the number of features; verify how callers
    handle very small vocabularies.
    """
    if top_n < 0:
        # A negative slice bound would silently drop the *weakest* terms
        # instead of limiting the list, so reject it up front.
        raise ValueError("top_n должен быть неотрицательным")

    svd = TruncatedSVD(n_components=config.n_components, random_state=config.random_state)
    X_reduced = svd.fit_transform(X)
    explained = svd.explained_variance_ratio_
    cumulative = np.cumsum(explained)

    top_terms_per_component: List[List[Tuple[str, float]]] = []
    for comp in svd.components_:
        # Rank by absolute loading so strongly negative terms are kept too;
        # the signed weight is what gets reported.
        idx = np.argsort(-np.abs(comp))[:top_n]
        top_terms_per_component.append([(feature_names[i], float(comp[i])) for i in idx])

    return {
        "svd": svd,
        "X_reduced": X_reduced,
        "explained_variance_ratio": explained,
        "explained_variance_ratio_cum": cumulative,
        "top_terms_per_component": top_terms_per_component,
    }
def embed_2d(X, method: str = "umap", random_state: int = 42, n_neighbors: int = 15, min_dist: float = 0.1):
    """Project a feature/vector matrix to two dimensions for visualisation.

    Args:
        X: Input matrix, handed unchanged to the reducer's ``fit_transform``.
        method: ``"umap"`` or ``"tsne"``.
        random_state: Seed forwarded to whichever reducer is used.
        n_neighbors: UMAP only; ignored for t-SNE.
        min_dist: UMAP only; ignored for t-SNE.

    Returns:
        Whatever the selected reducer's ``fit_transform`` returns
        (presumably an array with two columns, one row per input row).

    Raises:
        ImportError: If ``method`` is ``"umap"`` but umap-learn could not be
            imported when the module was loaded.
        ValueError: If ``method`` is neither ``"umap"`` nor ``"tsne"``.
    """
    if method == "umap":
        if not UMAP_AVAILABLE:
            raise ImportError("umap-learn не установлен")
        projector = umap.UMAP(
            n_components=2,
            random_state=random_state,
            n_neighbors=n_neighbors,
            min_dist=min_dist,
        )
    elif method == "tsne":
        # t-SNE is configured with PCA initialisation and the automatic
        # learning rate; every other setting is the library default.
        projector = TSNE(
            n_components=2,
            random_state=random_state,
            init="pca",
            learning_rate="auto",
        )
    else:
        raise ValueError("method должен быть 'umap' или 'tsne'")

    return projector.fit_transform(X)
def explained_variance_table(explained_ratio: np.ndarray) -> pd.DataFrame:
    """Tabulate per-component and cumulative explained-variance ratios.

    Args:
        explained_ratio: Explained-variance ratio of each component, in
            component order.

    Returns:
        A DataFrame with one row per component and three columns: the
        1-based component number, the ratio rounded to 6 decimals, and the
        running total of the ratios rounded to 6 decimals.
    """
    component_numbers = np.arange(1, len(explained_ratio) + 1)
    share = np.round(explained_ratio, 6)
    # The running total is accumulated on the unrounded ratios and rounded
    # afterwards, so rounding error does not build up across components.
    running_share = np.round(np.cumsum(explained_ratio), 6)

    columns = {
        "Компонента": component_numbers,
        "Доля дисперсии": share,
        "Накопленная доля": running_share,
    }
    return pd.DataFrame(columns)
def top_terms_dataframe(top_terms: List[List[Tuple[str, float]]], top_k: int = 10) -> pd.DataFrame:
    """Flatten per-component top terms into a long-format table.

    Args:
        top_terms: One list per component, each holding ``(term, weight)``
            tuples in the order they should be shown.
        top_k: Maximum number of terms taken from the start of each
            component's list.

    Returns:
        A DataFrame with one row per (component, term) pair and the columns
        "Компонента" (1-based component number), "Термин" (the term) and
        "Вес" (the weight as a float). The three columns are present even
        when there are no rows.
    """
    rows = [
        {"Компонента": comp_number, "Термин": term, "Вес": float(weight)}
        for comp_number, terms in enumerate(top_terms, start=1)
        for term, weight in terms[:top_k]
    ]
    # Explicit columns keep the schema stable for empty input; without them
    # pd.DataFrame([]) has no columns and downstream column access fails.
    return pd.DataFrame(rows, columns=["Компонента", "Термин", "Вес"])