"""Text preprocessing and prediction helpers.

``preproc_text`` lemmatizes a text with Mystem, tokenizes it on ``\\w+`` and
drops stop words; ``predict`` optionally applies that preprocessing and then
runs a model and maps its output back through a label encoder.
"""
import functools
import os

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from pandarallel import pandarallel
from pymystem3 import Mystem

# Import-time side effects kept from the original module: make sure the NLTK
# stop-word corpus is available and enable ``Series.parallel_apply``.
nltk.download('stopwords')
pandarallel.initialize(progress_bar=True)

# Extra entries added on top of NLTK's Russian stop-word list. The
# whitespace-only entries can never equal a ``\w+`` token; they are kept
# unchanged from the original list.
_EXTRA_STOP_WORDS = (
    ' ', ' \n', ' ', 'также', 'который', 'весь', 'заявлять', 'сообщать',
    'риа',
)

# Shared tokenizer; every token is a maximal run of ``\w`` characters.
_TOKENIZER = RegexpTokenizer(r'\w+')


@functools.lru_cache(maxsize=None)
def _stop_words():
    """Return the stop-word set, built once on first use.

    Built lazily rather than at import so that a missing corpus still
    surfaces on the first ``preproc_text`` call, as it did originally.
    """
    return frozenset(stopwords.words('russian')) | frozenset(_EXTRA_STOP_WORDS)


@functools.lru_cache(maxsize=None)
def _mystem_for_pid(pid):
    """Return the ``Mystem`` instance owned by the process with id *pid*.

    Keyed by PID so that a worker process created by forking does not reuse
    an instance (and whatever handles it holds) inherited from its parent.
    NOTE(review): assumes a ``Mystem`` instance is meant to be reused across
    many ``lemmatize`` calls -- confirm against the pymystem3 documentation.
    """
    return Mystem()


def preproc_text(x):
    """Lemmatize *x*, tokenize it and remove stop words.

    Args:
        x: The text to preprocess; passed as-is to ``Mystem.lemmatize``.

    Returns:
        The remaining lemmas joined by single spaces (``''`` if none remain).
    """
    mystem = _mystem_for_pid(os.getpid())
    lemmatized = ''.join(mystem.lemmatize(x))
    stop_words = _stop_words()
    return ' '.join(
        word for word in _TOKENIZER.tokenize(lemmatized)
        if word not in stop_words
    )


def predict(sample, model, label_encoder, preproc=True):
    """Predict decoded label(s) for a single text or a Series of texts.

    Args:
        sample: A ``pd.Series`` of texts, or one text as ``str``/``bytes``.
        model: Object whose ``predict`` accepts the texts; presumably a
            fitted pipeline that vectorizes raw text -- confirm with caller.
        label_encoder: Object whose ``inverse_transform`` maps the model
            output back to labels.
        preproc: When true, run ``preproc_text`` on the input first. When
            false the input is passed to the model unchanged (the original
            code returned ``None`` in that case without predicting).

    Returns:
        For a Series, the result of ``label_encoder.inverse_transform``;
        for a single text, its first (only) element.

    Raises:
        TypeError: If *sample* is neither a Series nor ``str``/``bytes``
            (the original code silently returned ``None``).
    """
    if isinstance(sample, pd.Series):
        if preproc:
            sample = sample.parallel_apply(preproc_text)
        return label_encoder.inverse_transform(model.predict(sample))

    if isinstance(sample, (bytes, str)):
        text = preproc_text(sample) if preproc else sample
        # The model is given a one-element batch; unwrap the single label.
        return label_encoder.inverse_transform(model.predict([text]))[0]

    raise TypeError(
        'sample must be a pandas Series, str or bytes, got '
        + type(sample).__name__
    )