| import nltk |
| from nltk.corpus import stopwords |
| from nltk.tokenize import RegexpTokenizer |
| nltk.download('stopwords') |
| from pymystem3 import Mystem |
| import pandas as pd |
| from pandarallel import pandarallel |
| pandarallel.initialize(progress_bar=True) |
|
|
|
|
def preproc_text(x):
    """Lemmatize a text with Mystem and drop Russian stop words.

    The input is lemmatized, the lemmatizer output is glued back into one
    string, that string is split into word-character tokens, and every token
    that appears in the stop-word set is discarded.

    Args:
        x: Raw text to normalise.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # A set keeps the per-token membership test below O(1); with a list every
    # token would trigger a linear scan over all stop words.
    stop_words = set(stopwords.words('russian'))
    stop_words.update([' ', ' \n', ' ', 'также', 'который', 'весь', 'заявлять', 'сообщать', 'риа'])

    # NOTE(review): a fresh Mystem is created on every call. That is costly,
    # but a shared instance could end up being inherited by forked
    # parallel_apply workers -- confirm before hoisting it to module level.
    mystem = Mystem()
    tokenizer = RegexpTokenizer(r'\w+')

    lemmatized = ''.join(mystem.lemmatize(x))
    tokens = tokenizer.tokenize(lemmatized)
    return ' '.join([word for word in tokens if word not in stop_words])
|
|
|
|
def predict(sample, model, label_encoder, preproc=True):
    """Predict decoded label(s) for one text or a collection of texts.

    Args:
        sample: A ``str``/``bytes`` holding a single text, or a
            ``pandas.Series`` of texts. With ``preproc=False`` any batch that
            ``model.predict`` accepts may be passed as well.
        model: Object exposing ``predict(samples)``.
        label_encoder: Object exposing ``inverse_transform(predictions)``.
        preproc: When true, run ``preproc_text`` over the input before
            predicting; when false, feed the input to the model as given.

    Returns:
        A single decoded label for a ``str``/``bytes`` input; otherwise the
        result of ``label_encoder.inverse_transform`` for the whole batch.

    Raises:
        TypeError: If ``preproc`` is true and ``sample`` is neither a
            ``pandas.Series`` nor a ``str``/``bytes``.
    """
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of str / bytes / Series.
    is_single = isinstance(sample, (bytes, str))

    if preproc:
        if is_single:
            sample = [preproc_text(sample)]
        elif isinstance(sample, pd.Series):
            sample = sample.parallel_apply(preproc_text)
        else:
            # Fail loudly instead of silently returning None.
            raise TypeError(
                "predict() cannot preprocess a sample of type {!r}; "
                "expected pandas.Series, str or bytes".format(type(sample).__name__)
            )
    elif is_single:
        # The model works on batches, so wrap a lone text in a list.
        sample = [sample]

    labels = label_encoder.inverse_transform(model.predict(sample))
    # A single text goes in, so a single label comes out.
    return labels[0] if is_single else labels