Spaces:
Sleeping
Sleeping
| import re | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.base import BaseEstimator, TransformerMixin | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.decomposition import TruncatedSVD | |
| from sklearn.pipeline import Pipeline, FeatureUnion | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.preprocessing import Normalizer | |
| import joblib | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from pymorphy2 import MorphAnalyzer | |
| import string | |
# Fetch NLTK corpora at import time (no-op if already cached locally).
nltk.download('stopwords')  # Russian stopword list used by TextPreprocessor
# NOTE(review): 'punkt' looks unused in this file — tokenization below is
# plain str.split(), not nltk.word_tokenize. Confirm before removing.
nltk.download('punkt')
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Sklearn-compatible transformer that normalizes Russian text.

    For each input string: strips URLs, drops characters outside
    Latin/Cyrillic letters, digits and basic punctuation, removes the
    remaining punctuation, lowercases, filters Russian stopwords, and
    lemmatizes each surviving token with pymorphy2.
    """

    # Compiled once at class definition instead of on every call
    # (the original recompiled both patterns per document).
    # 'http\S+' already matches 'https\S+', so the redundant
    # alternative was dropped — same behavior.
    _URL_RE = re.compile(r'http\S+|www\S+')
    # Keep only letters (Latin/Cyrillic incl. ё/Ё), digits,
    # basic punctuation, and whitespace.
    _CLEAN_RE = re.compile(r'[^a-zA-Zа-яА-ЯёЁ0-9.,!?;:\s]')
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        # Russian stopword set and morphological analyzer are built once
        # per transformer instance.
        self.stop_words = set(stopwords.words('russian'))
        self.morph = MorphAnalyzer()

    def preprocess_text(self, text: str) -> str:
        """Return *text* cleaned, lowercased, stopword-filtered, lemmatized."""
        # Remove URLs BEFORE stripping special characters; the original
        # order deleted '/', ':' companions first, which could leave URL
        # fragments that 'http\S+'/'www\S+' no longer match.
        text = self._URL_RE.sub('', text)
        text = self._CLEAN_RE.sub('', text)
        text = text.translate(self._PUNCT_TABLE)
        text = text.lower()
        tokens = text.split()
        # Stopwords are filtered on the raw lowercase token (as in the
        # original), then parse()[0] gives pymorphy2's most probable lemma.
        return ' '.join(
            self.morph.parse(word)[0].normal_form
            for word in tokens
            if word not in self.stop_words
        )

    def fit(self, X, y=None):
        """No-op fit: the transformer learns nothing from the data."""
        return self

    def transform(self, X, y=None):
        """Apply preprocess_text element-wise; X is expected to support
        .apply (e.g. a pandas Series)."""
        return X.apply(self.preprocess_text)