File size: 2,053 Bytes
edcd390
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#text_preprocessor.py

import pandas as pd
import re
import string
import pymorphy3
from sklearn.base import BaseEstimator, TransformerMixin

class MyCustomTextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that normalizes Russian text.

    Every document is lower-cased, stripped of markup tags, URLs, mentions,
    hashtags, digits, punctuation and ASCII Latin letters, filtered against
    a stop-word list and, optionally, lemmatized with pymorphy3.

    Note:
        Constructing an instance downloads the stop-word list over the
        network and builds a ``pymorphy3.MorphAnalyzer``; both happen
        eagerly in ``__init__``.
    """

    # Ordered (compiled pattern, replacement) steps applied by ``clean``.
    # The order matters:
    #   * tags are stripped first, while '<' and '>' are still present and
    #     before the URL pattern can swallow text glued to an attribute URL;
    #   * typographic punctuation / special spaces become a space *before*
    #     the generic punctuation pass, which would otherwise delete them
    #     and merge the neighbouring words.
    _CLEAN_STEPS = (
        (re.compile(r'<.*?>'), ' '),                  # markup tags
        (re.compile(r'http\S+'), ' '),                # URLs
        (re.compile(r'@\w+'), ' '),                   # @mentions
        (re.compile(r'#\w+'), ' '),                   # #hashtags
        (re.compile(r'\d+'), ' '),                    # digit runs
        (re.compile(r'[\u00A0\u2000-\u206F]'), ' '),  # NBSP, dashes, quotes...
        (re.compile(r'[^\w\s]|_'), ''),               # remaining punctuation
        (re.compile(r'[a-zA-Z]'), ''),                # ASCII Latin letters
        (re.compile(r'\s+'), ' '),                    # collapse whitespace
    )

    def __init__(self):
        # Set of stop words used by ``remove_stopwords``.
        self.stop_words = self.get_stopwords_list()
        # Morphological analyzer used by ``lemmatize``.
        self.morph = pymorphy3.MorphAnalyzer()

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer has no fitted state."""
        return self

    def transform(self, texts, y=None, lemmatize=True):
        """Preprocess every document in ``texts``.

        Args:
            texts: Iterable of documents.
            y: Ignored; accepted for scikit-learn API compatibility.
            lemmatize: When false, the lemmatization step is skipped.

        Returns:
            A list with one preprocessed string per input document.
        """
        return [self.preprocess(text, lemmatize=lemmatize) for text in texts]

    def get_stopwords_list(self):
        """Download the stopwords-iso Russian stop-word list.

        Returns:
            A set with the entries of the downloaded file.

        Requires network access; errors raised by ``pandas.read_csv``
        propagate to the caller.
        """
        url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ru/master/stopwords-ru.txt"
        stopwords_cust = set(pd.read_csv(url, header=None, names=["stopwords"], encoding="utf-8")['stopwords'])
        return stopwords_cust

    def clean(self, text):
        """Lower-case ``text`` and strip everything except non-Latin words.

        Args:
            text: Raw document. ``None`` and float NaN (pandas missing
                values) are treated as an empty document.

        Returns:
            The cleaned text with single spaces between tokens and no
            leading or trailing whitespace.
        """
        # Missing values would otherwise fail on ``.lower()``; returning an
        # empty string keeps the output aligned with the input rows.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = text.lower()
        for pattern, replacement in self._CLEAN_STEPS:
            text = pattern.sub(replacement, text)
        return text.strip()

    def remove_stopwords(self, text):
        """Drop whitespace-separated tokens that are in ``self.stop_words``."""
        return ' '.join(word for word in text.split() if word not in self.stop_words)

    def lemmatize(self, text):
        """Replace each token with the normal form of its first parse.

        Returns:
            The lemmas joined by single spaces (no trailing space).
        """
        parse = self.morph.parse
        return ' '.join(parse(word)[0].normal_form for word in text.split())

    def preprocess(self, text, lemmatize=True):
        """Run the full pipeline: clean, remove stop words, then lemmatize.

        Args:
            text: Raw document.
            lemmatize: Set to false to skip the lemmatization step.

        Returns:
            The preprocessed document as a string.
        """
        text = self.clean(text)
        text = self.remove_stopwords(text)
        if lemmatize:
            text = self.lemmatize(text)
        return text