espa_model / text_features.py
kanslor821's picture
Upload 10 files
0f34acb verified
"""
Модуль извлечения текстовых признаков: BM25 и TextRank
"""
import math
import re
import numpy as np
from collections import Counter
from nltk.stem.snowball import RussianStemmer
from sklearn.preprocessing import MinMaxScaler # Изменено: RobustScaler -> MinMaxScaler
class TextFeaturesExtractor:
    """Extract per-sentence BM25 and TextRank features from Russian/English text.

    Every sentence is tokenized into lower-cased, stemmed words.  BM25 scores
    each sentence against its own tokens (the sentence acts as both query and
    document), and TextRank runs a PageRank-style iteration over a sentence
    similarity graph built from TF-IDF vectors.
    """

    # Words of 3+ Cyrillic/Latin letters.  'ё'/'Ё' are listed explicitly
    # because they lie outside the contiguous 'а'-'я' / 'А'-'Я' code-point
    # ranges; without them any word containing 'ё' fails the \b-anchored match
    # and is dropped as a whole.
    _WORD_RE = re.compile(r'\b[а-яёА-ЯЁa-zA-Z]{3,}\b')

    def __init__(self):
        """Create the stemmer and the stop-word set used by `tokenize`."""
        self.stemmer = RussianStemmer()
        self.stop_words = {
            'и', 'в', 'во', 'на', 'с', 'со', 'к', 'по', 'из', 'за', 'у', 'о',
            'об', 'при', 'без', 'до', 'для', 'через', 'над', 'под', 'про', 'а',
            'но', 'да', 'или', 'же', 'бы', 'это', 'тот', 'та', 'так', 'все',
            'всё', 'всего', 'весь', 'этот', 'то', 'который', 'как', 'что',
            'чтобы', 'быть',
        }

    def tokenize(self, text: str) -> list:
        """Split text into stemmed tokens, dropping stop words.

        Args:
            text: Raw sentence text.

        Returns:
            List of stems for every lower-cased word of at least three
            Cyrillic/Latin letters that is not in `self.stop_words`.
        """
        words = self._WORD_RE.findall(text.lower())
        # Stop words are filtered before stemming, so the stop list must hold
        # surface forms, not stems.
        return [self.stemmer.stem(w) for w in words if w not in self.stop_words]

    def compute_idf(self, sentences_tokens: list) -> dict:
        """Compute BM25-style IDF, treating every sentence as a document.

        Args:
            sentences_tokens: List of token lists, one per sentence.

        Returns:
            Mapping token -> log((N - df + 0.5) / (df + 0.5) + 1), where N is
            the number of sentences and df the number of sentences containing
            the token.  The "+ 1" keeps every value positive.
        """
        total = len(sentences_tokens)
        # set() makes each sentence count at most once per token.
        doc_freq = Counter(w for tokens in sentences_tokens for w in set(tokens))
        return {
            w: math.log((total - df + 0.5) / (df + 0.5) + 1.0)
            for w, df in doc_freq.items()
        }

    def compute_bm25(self, sentences_tokens: list, idf: dict, k1: float = 1.5, b: float = 0.75) -> np.ndarray:
        """Compute a BM25 score for every sentence against its own tokens.

        Args:
            sentences_tokens: List of token lists, one per sentence.
            idf: Token -> IDF mapping; tokens missing from it are skipped.
            k1: Term-frequency saturation parameter.
            b: Length-normalisation strength.

        Returns:
            1-D float array with one score per sentence (empty for no input).
        """
        if not sentences_tokens:
            # Avoid np.mean([]) (NaN plus RuntimeWarning) on empty input.
            return np.array([], dtype=float)

        avg_len = float(np.mean([len(tokens) for tokens in sentences_tokens]))
        scores = []
        for tokens in sentences_tokens:
            score = 0.0
            if tokens:
                # avg_len > 0 here because at least this sentence is non-empty.
                length_norm = 1 - b + b * len(tokens) / avg_len
                for w, tf in Counter(tokens).items():
                    if w not in idf:
                        continue
                    score += idf[w] * tf * (k1 + 1) / (tf + k1 * length_norm)
            scores.append(score)
        return np.array(scores, dtype=float)

    def compute_textrank(self, sentences_tokens: list, threshold: float = 0.3,
                         damping: float = 0.85, iterations: int = 50) -> np.ndarray:
        """Compute TextRank scores over a TF-IDF sentence-similarity graph.

        Args:
            sentences_tokens: List of token lists, one per sentence.
            threshold: Similarities below this value are treated as no edge.
            damping: Damping factor of the PageRank-style update.
            iterations: Number of update steps (fixed, no convergence test).

        Returns:
            1-D float array with one rank per sentence (empty for no input).
        """
        n = len(sentences_tokens)
        if n == 0:
            return np.array([], dtype=float)

        from sklearn.feature_extraction.text import TfidfVectorizer

        docs = [' '.join(tokens) for tokens in sentences_tokens]
        try:
            tfidf = TfidfVectorizer().fit_transform(docs)
        except ValueError:
            # Raised for an empty vocabulary (no sentence produced a token the
            # vectorizer accepts): fall back to a graph without edges instead
            # of crashing.
            similarity = np.zeros((n, n))
        else:
            # '@' is a matrix product for both sparse matrices and sparse
            # arrays, whereas '*' is only a matrix product for the former.
            similarity = (tfidf @ tfidf.T).toarray()
            similarity[similarity < threshold] = 0
        # NOTE(review): the diagonal (self-similarity) is kept, so non-empty
        # sentences have a self-loop - confirm this is intended.

        # Row-normalise; rows without any edge keep an all-zero row.
        row_sum = similarity.sum(axis=1, keepdims=True)
        row_sum[row_sum == 0] = 1
        transition = similarity / row_sum

        ranks = np.ones(n) / n
        for _ in range(iterations):
            ranks = (1 - damping) + damping * transition.T @ ranks
        return ranks

    def extract(self, sentences_text: list) -> tuple:
        """Extract BM25 and TextRank features, min-max scaled to [0, 1].

        Args:
            sentences_text: List of raw sentence strings.

        Returns:
            Tuple (bm25_norm, textrank_norm, T_base) of 1-D arrays with one
            value per sentence; T_base is the mean of the two scaled
            features.  All three are empty when no sentences are given.
        """
        if not sentences_text:
            # MinMaxScaler rejects zero-sample input, so short-circuit.
            empty = np.array([], dtype=float)
            return empty, empty.copy(), empty.copy()

        tokens = [self.tokenize(text) for text in sentences_text]
        idf = self.compute_idf(tokens)
        bm25 = self.compute_bm25(tokens, idf)
        textrank = self.compute_textrank(tokens)

        # fit_transform refits on every call, so one scaler can be reused.
        scaler = MinMaxScaler(feature_range=(0, 1))
        bm25_norm = scaler.fit_transform(bm25.reshape(-1, 1)).flatten()
        textrank_norm = scaler.fit_transform(textrank.reshape(-1, 1)).flatten()
        T_base = (bm25_norm + textrank_norm) / 2
        return bm25_norm, textrank_norm, T_base