File size: 4,395 Bytes
0f34acb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""

Модуль извлечения текстовых признаков: BM25 и TextRank

"""
import math
import re
import numpy as np
from collections import Counter
from nltk.stem.snowball import RussianStemmer
from sklearn.preprocessing import MinMaxScaler  # Изменено: RobustScaler -> MinMaxScaler


class TextFeaturesExtractor:
    """Extract per-sentence BM25 and TextRank features from a document.

    A document is passed in as a list of sentence strings. Each sentence is
    tokenized (lower-cased, stop words removed, stemmed with the Russian
    Snowball stemmer), scored with BM25 and TextRank, and both score vectors
    are min-max normalized to [0, 1].
    """

    # Words of three or more Cyrillic/Latin letters. The text is lower-cased
    # before matching, so only lower-case ranges are needed. 'ё' is listed
    # explicitly because it lies outside the 'а-я' code-point range; without
    # it every word containing 'ё' would be dropped entirely.
    _WORD_RE = re.compile(r'\b[а-яёa-z]{3,}\b')

    def __init__(self):
        """Create the stemmer and the stop-word set used by ``tokenize``."""
        self.stemmer = RussianStemmer()
        self.stop_words = {
            'и', 'в', 'во', 'на', 'с', 'со', 'к', 'по', 'из', 'за', 'у', 'о',
            'об', 'при', 'без', 'до', 'для', 'через', 'над', 'под', 'про', 'а',
            'но', 'да', 'или', 'же', 'бы', 'это', 'тот', 'та', 'так', 'все',
            'всё', 'всего', 'весь', 'этот', 'то', 'который', 'как', 'что',
            'чтобы', 'быть',
        }

    def tokenize(self, text: str) -> list:
        """Split *text* into stemmed tokens with stop words removed.

        Args:
            text: Raw sentence text.

        Returns:
            List of stems, in order of appearance. Only words made of at
            least three Cyrillic or Latin letters are kept; stop words are
            filtered before stemming.
        """
        words = self._WORD_RE.findall(text.lower())
        stem = self.stemmer.stem
        return [stem(word) for word in words if word not in self.stop_words]

    def compute_idf(self, sentences_tokens: list) -> dict:
        """Compute BM25-style IDF, treating every sentence as a document.

        Args:
            sentences_tokens: List of token lists, one per sentence.

        Returns:
            Mapping ``token -> log((N - df + 0.5) / (df + 0.5) + 1)`` where
            ``N`` is the number of sentences and ``df`` the number of
            sentences containing the token. Empty input yields ``{}``.
        """
        total = len(sentences_tokens)
        # set(tokens) makes each sentence count at most once per token.
        doc_freq = Counter(
            token for tokens in sentences_tokens for token in set(tokens)
        )
        return {
            token: math.log((total - df + 0.5) / (df + 0.5) + 1.0)
            for token, df in doc_freq.items()
        }

    def compute_bm25(self, sentences_tokens: list, idf: dict, k1: float = 1.5, b: float = 0.75) -> np.ndarray:
        """Compute a BM25 score for each sentence against its own terms.

        Args:
            sentences_tokens: List of token lists, one per sentence.
            idf: Token -> IDF weight; tokens missing from it are ignored.
            k1: Term-frequency saturation parameter.
            b: Length-normalization strength.

        Returns:
            1-D float array with one score per sentence (empty for empty
            input).
        """
        if not sentences_tokens:
            # np.mean([]) would emit a RuntimeWarning and produce NaN.
            return np.array([], dtype=float)

        avg_len = np.mean([len(tokens) for tokens in sentences_tokens])
        scores = []
        for tokens in sentences_tokens:
            # avg_len can only be 0 when every sentence is empty, in which
            # case the inner loop never runs and no division happens.
            score = 0.0
            for token, tf in Counter(tokens).items():
                weight = idf.get(token)
                if weight is None:
                    continue
                numerator = tf * (k1 + 1)
                denominator = tf + k1 * (1 - b + b * len(tokens) / avg_len)
                score += weight * numerator / denominator
            scores.append(score)

        return np.array(scores, dtype=float)

    def compute_textrank(self, sentences_tokens: list, threshold: float = 0.3,
                         damping: float = 0.85, iterations: int = 50) -> np.ndarray:
        """Compute TextRank scores over a TF-IDF sentence-similarity graph.

        Edges are TF-IDF dot products between sentences; edges weaker than
        *threshold* are removed, and self-loops are excluded so that a
        sentence cannot vote for itself.

        Args:
            sentences_tokens: List of token lists, one per sentence.
            threshold: Minimum similarity for an edge to be kept.
            damping: Damping factor of the rank update.
            iterations: Number of rank-update iterations.

        Returns:
            1-D float array with one rank per sentence (empty for empty
            input).
        """
        count = len(sentences_tokens)
        if count == 0:
            return np.array([], dtype=float)

        from sklearn.feature_extraction.text import TfidfVectorizer

        documents = [' '.join(tokens) for tokens in sentences_tokens]
        try:
            tfidf_matrix = TfidfVectorizer().fit_transform(documents)
        except ValueError:
            # Raised for an empty vocabulary (e.g. every sentence tokenized
            # to nothing): treat the graph as having no edges.
            similarity = np.zeros((count, count))
        else:
            similarity = (tfidf_matrix @ tfidf_matrix.T).toarray()

        # Self-similarity always passes the threshold and would let an
        # otherwise unconnected sentence keep an average rank; drop it.
        np.fill_diagonal(similarity, 0.0)
        similarity[similarity < threshold] = 0

        # Normalize each row to sum to 1; rows without edges stay all-zero.
        row_sum = similarity.sum(axis=1, keepdims=True)
        row_sum[row_sum == 0] = 1
        transition = similarity / row_sum

        step = damping * transition.T  # loop-invariant
        ranks = np.full(count, 1.0 / count)
        for _ in range(iterations):
            ranks = (1 - damping) + step @ ranks

        return ranks

    def extract(self, sentences_text: list) -> tuple:
        """Extract BM25 and TextRank features normalized to [0, 1].

        Args:
            sentences_text: List of raw sentence strings.

        Returns:
            Tuple ``(bm25_norm, textrank_norm, T_base)`` of 1-D arrays with
            one value per sentence, where ``T_base`` is the mean of the two
            normalized scores. Empty input yields three empty arrays.
        """
        if not sentences_text:
            # MinMaxScaler cannot be fitted on zero samples.
            return (np.array([], dtype=float),
                    np.array([], dtype=float),
                    np.array([], dtype=float))

        tokens = [self.tokenize(text) for text in sentences_text]
        idf = self.compute_idf(tokens)
        bm25 = self.compute_bm25(tokens, idf)
        textrank = self.compute_textrank(tokens)

        # fit_transform refits the scaler, so one instance serves both.
        scaler = MinMaxScaler(feature_range=(0, 1))
        bm25_norm = scaler.fit_transform(bm25.reshape(-1, 1)).flatten()
        textrank_norm = scaler.fit_transform(textrank.reshape(-1, 1)).flatten()

        T_base = (bm25_norm + textrank_norm) / 2

        return bm25_norm, textrank_norm, T_base