""" Модуль извлечения текстовых признаков: BM25 и TextRank """ import math import re import numpy as np from collections import Counter from nltk.stem.snowball import RussianStemmer from sklearn.preprocessing import MinMaxScaler # Изменено: RobustScaler -> MinMaxScaler class TextFeaturesExtractor: """Извлечение текстовых признаков BM25 и TextRank.""" def __init__(self): self.stemmer = RussianStemmer() self.stop_words = set([ 'и', 'в', 'во', 'на', 'с', 'со', 'к', 'по', 'из', 'за', 'у', 'о', 'об', 'при', 'без', 'до', 'для', 'через', 'над', 'под', 'про', 'а', 'но', 'да', 'или', 'же', 'бы', 'это', 'тот', 'та', 'так', 'все', 'всё', 'всего', 'весь', 'этот', 'то', 'который', 'как', 'что', 'чтобы', 'быть' ]) def tokenize(self, text: str) -> list: """Токенизация и стемминг с удалением стоп-слов.""" words = re.findall(r'\b[а-яА-Яa-zA-Z]{3,}\b', text.lower()) words = [self.stemmer.stem(w) for w in words if w not in self.stop_words] return words def compute_idf(self, sentences_tokens: list) -> dict: """Вычисление IDF на основе документа.""" N = len(sentences_tokens) doc_freq = Counter() for tokens in sentences_tokens: for w in set(tokens): doc_freq[w] += 1 idf = {} for w, df in doc_freq.items(): idf[w] = math.log((N - df + 0.5) / (df + 0.5) + 1.0) return idf def compute_bm25(self, sentences_tokens: list, idf: dict, k1: float = 1.5, b: float = 0.75) -> np.ndarray: """Вычисление BM25 для каждого предложения.""" N = len(sentences_tokens) avg_len = np.mean([len(tokens) for tokens in sentences_tokens]) scores = [] for tokens in sentences_tokens: score = 0.0 term_freq = Counter(tokens) for w, tf in term_freq.items(): if w not in idf: continue numerator = tf * (k1 + 1) denominator = tf + k1 * (1 - b + b * len(tokens) / avg_len) score += idf[w] * numerator / denominator scores.append(score) return np.array(scores) def compute_textrank(self, sentences_tokens: list, threshold: float = 0.3, damping: float = 0.85, iterations: int = 50) -> np.ndarray: """Вычисление TextRank для предложений.""" sentences_text = [' '.join(tokens) for tokens in sentences_tokens] from sklearn.feature_extraction.text import TfidfVectorizer vectorizer = TfidfVectorizer() tfidf_matrix = vectorizer.fit_transform(sentences_text) similarity = (tfidf_matrix * tfidf_matrix.T).toarray() similarity[similarity < threshold] = 0 col_sum = similarity.sum(axis=1, keepdims=True) col_sum[col_sum == 0] = 1 transition = similarity / col_sum ranks = np.ones(len(sentences_tokens)) / len(sentences_tokens) for _ in range(iterations): ranks = (1 - damping) + damping * transition.T @ ranks return ranks def extract(self, sentences_text: list) -> tuple: """ Извлечение BM25 и TextRank с нормализацией в [0, 1]. Returns: (bm25_norm, textrank_norm, T_base) """ tokens = [self.tokenize(text) for text in sentences_text] idf = self.compute_idf(tokens) bm25 = self.compute_bm25(tokens, idf) textrank = self.compute_textrank(tokens) # Используем MinMaxScaler для нормализации в диапазон [0, 1] scaler = MinMaxScaler(feature_range=(0, 1)) bm25_norm = scaler.fit_transform(bm25.reshape(-1, 1)).flatten() textrank_norm = scaler.fit_transform(textrank.reshape(-1, 1)).flatten() T_base = (bm25_norm + textrank_norm) / 2 return bm25_norm, textrank_norm, T_base