Spaces:
Sleeping
Sleeping
| """ | |
| Модуль извлечения текстовых признаков: BM25 и TextRank | |
| """ | |
| import math | |
| import re | |
| import numpy as np | |
| from collections import Counter | |
| from nltk.stem.snowball import RussianStemmer | |
| from sklearn.preprocessing import MinMaxScaler # Изменено: RobustScaler -> MinMaxScaler | |
class TextFeaturesExtractor:
    """Extract per-sentence BM25 and TextRank text features.

    Each sentence is tokenized with a regular expression, filtered against a
    small Russian stop-word list and stemmed with ``RussianStemmer``. The
    sentences are then scored against each other: every sentence plays the
    role of a "document" for the IDF / BM25 / TF-IDF computations.
    """

    # Runs of 3+ Cyrillic or Latin letters. The text is lower-cased and has
    # "ё" folded to "е" before matching, so lower-case ranges are sufficient.
    _WORD_RE = re.compile(r'\b[а-яa-z]{3,}\b')

    def __init__(self):
        self.stemmer = RussianStemmer()
        self.stop_words = {
            'и', 'в', 'во', 'на', 'с', 'со', 'к', 'по', 'из', 'за', 'у', 'о',
            'об', 'при', 'без', 'до', 'для', 'через', 'над', 'под', 'про', 'а',
            'но', 'да', 'или', 'же', 'бы', 'это', 'тот', 'та', 'так', 'все',
            'всё', 'всего', 'весь', 'этот', 'то', 'который', 'как', 'что',
            'чтобы', 'быть',
        }

    def tokenize(self, text: str) -> list:
        """Return the stemmed, stop-word-free tokens of ``text``.

        Args:
            text: Raw sentence text.

        Returns:
            List of stems, in order of appearance. Words shorter than three
            letters and words listed in ``self.stop_words`` are dropped.
        """
        # "ё" (U+0451) lies outside the "а-я" range, so without this folding
        # every word containing it would fail the \b-anchored match and be
        # dropped silently (and the stop word "всё" could never be matched).
        normalized = text.lower().replace('ё', 'е')
        return [
            self.stemmer.stem(word)
            for word in self._WORD_RE.findall(normalized)
            if word not in self.stop_words
        ]

    def compute_idf(self, sentences_tokens: list) -> dict:
        """Compute BM25-style IDF weights, treating each sentence as a document.

        Args:
            sentences_tokens: List of token lists, one per sentence.

        Returns:
            Mapping ``term -> log((N - df + 0.5) / (df + 0.5) + 1)`` where
            ``N`` is the number of sentences and ``df`` the number of
            sentences containing the term. Empty input gives an empty dict.
        """
        n_sentences = len(sentences_tokens)
        # set(tokens) so that a term counts once per sentence.
        doc_freq = Counter(
            term for tokens in sentences_tokens for term in set(tokens)
        )
        return {
            term: math.log((n_sentences - df + 0.5) / (df + 0.5) + 1.0)
            for term, df in doc_freq.items()
        }

    def compute_bm25(self, sentences_tokens: list, idf: dict,
                     k1: float = 1.5, b: float = 0.75) -> np.ndarray:
        """Compute a BM25 score for every sentence.

        Each sentence is scored using its own terms as the query.

        Args:
            sentences_tokens: List of token lists, one per sentence.
            idf: Term -> IDF weight; terms missing from it are ignored.
            k1: Term-frequency saturation parameter.
            b: Length-normalization strength.

        Returns:
            1-D float array with one score per sentence (empty for empty
            input). Sentences without tokens score ``0.0``.
        """
        if not sentences_tokens:
            # np.mean([]) would emit a RuntimeWarning and produce NaN.
            return np.array([], dtype=float)

        avg_len = np.mean([len(tokens) for tokens in sentences_tokens])
        scores = []
        for tokens in sentences_tokens:
            if not tokens:
                # Nothing to score; also avoids 0 / 0 when avg_len == 0.
                scores.append(0.0)
                continue
            # Identical for every term of the sentence, so computed once.
            length_norm = k1 * (1 - b + b * len(tokens) / avg_len)
            score = 0.0
            for term, tf in Counter(tokens).items():
                if term not in idf:
                    continue
                score += idf[term] * (tf * (k1 + 1)) / (tf + length_norm)
            scores.append(score)
        return np.array(scores)

    def compute_textrank(self, sentences_tokens: list, threshold: float = 0.3,
                         damping: float = 0.85, iterations: int = 50) -> np.ndarray:
        """Compute TextRank scores over a TF-IDF similarity graph.

        Args:
            sentences_tokens: List of token lists, one per sentence.
            threshold: Similarities below this value are set to zero.
            damping: Damping factor of the rank update.
            iterations: Number of rank-update iterations.

        Returns:
            1-D float array with one rank per sentence (empty for empty
            input).
        """
        n_sentences = len(sentences_tokens)
        if n_sentences == 0:
            return np.array([], dtype=float)

        from sklearn.feature_extraction.text import TfidfVectorizer

        documents = [' '.join(tokens) for tokens in sentences_tokens]
        try:
            tfidf_matrix = TfidfVectorizer().fit_transform(documents)
        except ValueError:
            # Raised for an empty vocabulary (no sentence produced a usable
            # token): fall back to a graph without edges instead of crashing.
            similarity = np.zeros((n_sentences, n_sentences))
        else:
            # "@" is a matrix product for both sparse matrices and sparse
            # arrays, whereas "*" is element-wise for sparse arrays.
            similarity = (tfidf_matrix @ tfidf_matrix.T).toarray()

        similarity[similarity < threshold] = 0

        # Row-normalize; rows without any edge keep all-zero weights.
        row_sums = similarity.sum(axis=1, keepdims=True)
        row_sums[row_sums == 0] = 1
        transition = similarity / row_sums

        # Loop-invariant part of the update, hoisted out of the iteration.
        damped_transition = damping * transition.T
        ranks = np.ones(n_sentences) / n_sentences
        for _ in range(iterations):
            ranks = (1 - damping) + damped_transition @ ranks
        return ranks

    def extract(self, sentences_text: list) -> tuple:
        """Extract BM25 and TextRank features, each min-max scaled to [0, 1].

        Args:
            sentences_text: List of raw sentence strings.

        Returns:
            Tuple ``(bm25_norm, textrank_norm, T_base)`` of 1-D arrays with
            one value per sentence, where ``T_base`` is the mean of the two
            normalized features. Empty input gives three empty arrays.
        """
        if not sentences_text:
            # MinMaxScaler refuses to fit on zero samples.
            empty = np.array([], dtype=float)
            return empty, empty.copy(), empty.copy()

        tokens = [self.tokenize(text) for text in sentences_text]
        idf = self.compute_idf(tokens)
        bm25 = self.compute_bm25(tokens, idf)
        textrank = self.compute_textrank(tokens)

        # The scaler is re-fitted for each feature, so they scale independently.
        scaler = MinMaxScaler(feature_range=(0, 1))
        bm25_norm = scaler.fit_transform(bm25.reshape(-1, 1)).flatten()
        textrank_norm = scaler.fit_transform(textrank.reshape(-1, 1)).flatten()

        T_base = (bm25_norm + textrank_norm) / 2
        return bm25_norm, textrank_norm, T_base