Spaces:
Sleeping
Sleeping
| #text_preprocessor.py | |
| import pandas as pd | |
| import re | |
| import string | |
| import pymorphy3 | |
| from sklearn.base import BaseEstimator, TransformerMixin | |
class MyCustomTextPreprocessor(BaseEstimator, TransformerMixin):
    """Sklearn-compatible transformer that cleans and lemmatizes Russian text.

    Per-document pipeline: lowercase; strip HTML tags, URLs, @mentions,
    #hashtags, digits, punctuation, Unicode space/punctuation characters and
    Latin letters; drop Russian stopwords; optionally lemmatize each token
    with pymorphy3.
    """

    def __init__(self):
        # NOTE(review): fetches the stopword list over the network at
        # construction time — every instantiation requires connectivity.
        self.stop_words = self.get_stopwords_list()
        self.morph = pymorphy3.MorphAnalyzer()

    def fit(self, X, y=None):
        """No-op fit (stateless transformer); returns self per the sklearn API."""
        return self

    def transform(self, texts, y=None, lemmatize=True):
        """Preprocess an iterable of strings; returns a list of cleaned strings."""
        return [self.preprocess(text, lemmatize=lemmatize) for text in texts]

    def get_stopwords_list(self):
        """Download the stopwords-iso Russian stopword list and return it as a set."""
        url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ru/master/stopwords-ru.txt"
        frame = pd.read_csv(url, header=None, names=["stopwords"], encoding="utf-8")
        return set(frame["stopwords"])

    def clean(self, text):
        """Normalize raw text to lowercase words separated by single spaces."""
        text = text.lower()
        # Strip HTML tags FIRST. In the original this ran after the
        # punctuation removal below, which had already deleted '<' and '>',
        # so the tag pattern could never match (dead code / bug).
        text = re.sub(r'<.*?>', ' ', text)
        text = re.sub(r'http\S+', " ", text)   # URLs
        text = re.sub(r'@\w+', ' ', text)      # @mentions
        text = re.sub(r'#\w+', ' ', text)      # #hashtags
        text = re.sub(r'\d+', ' ', text)       # digit runs
        # Drop symbols other than word chars, whitespace and commas...
        text = re.sub(r'[^\w\s,]', '', text)
        # ...then remove remaining ASCII punctuation (incl. commas, underscores).
        text = text.translate(str.maketrans('', '', string.punctuation))
        # Unicode NBSP and the General Punctuation block (dashes, thin spaces, ...).
        text = re.sub(r'[\u00A0\u2000-\u206F]', ' ', text)
        text = re.sub(r'[a-zA-Z]', '', text)   # drop Latin letters entirely
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def remove_stopwords(self, text):
        """Drop whitespace-separated tokens that occur in the stopword set."""
        return ' '.join(word for word in text.split() if word not in self.stop_words)

    def lemmatize(self, text):
        """Replace each whitespace-separated token with its pymorphy3 normal form."""
        morph = self.morph
        # ' '.join fixes the trailing space (and quadratic '+=' concatenation)
        # the original accumulator loop produced.
        return ' '.join(morph.parse(word)[0].normal_form for word in text.split())

    def preprocess(self, text, lemmatize=True):
        """Full processing of one text, with the option to disable lemmatization."""
        text = self.clean(text)
        text = self.remove_stopwords(text)
        if lemmatize:
            text = self.lemmatize(text)
        return text